Skip to main content

nucleus/security/
gvisor.rs

1use crate::error::{NucleusError, Result};
2use crate::oci::OciBundle;
3use nix::unistd::Uid;
4use std::ffi::CString;
5use std::os::unix::fs::PermissionsExt;
6use std::path::Path;
7use std::process::Command;
8use tracing::{debug, info};
9
/// Network mode for gVisor runtime.
///
/// Each variant is translated into the value of runsc's `--network` flag
/// by `GVisorRuntime::exec_with_oci_bundle_network`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GVisorNetworkMode {
    /// No networking (fully isolated). Default for agent workloads.
    /// Passed to runsc as `--network none`.
    None,
    /// gVisor user-space network stack. Suitable for networked production services
    /// that need gVisor isolation with network access.
    /// Passed to runsc as `--network sandbox`.
    Sandbox,
    /// Share host network namespace. Use with caution.
    /// Passed to runsc as `--network host`.
    Host,
}
21
/// Platform backend for gVisor's Sentry.
///
/// The selected variant is rendered by `GVisorPlatform::as_flag` and passed
/// to runsc as the value of its `--platform` flag.
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Default,
    clap::ValueEnum,
    serde::Serialize,
    serde::Deserialize,
)]
pub enum GVisorPlatform {
    /// systrap backend, the current default and most broadly compatible option.
    /// This is the variant produced by `Default`.
    #[default]
    Systrap,
    /// KVM-backed sandboxing for the Sentry itself.
    Kvm,
    /// ptrace backend for maximal compatibility where systrap/KVM are unavailable.
    Ptrace,
}
43
44impl GVisorPlatform {
45    pub fn as_flag(self) -> &'static str {
46        match self {
47            Self::Systrap => "systrap",
48            Self::Kvm => "kvm",
49            Self::Ptrace => "ptrace",
50        }
51    }
52}
53
/// GVisor runtime manager
///
/// Implements the gVisor state machine from
/// NucleusSecurity_GVisor_GVisorRuntime.tla
pub struct GVisorRuntime {
    /// Filesystem path of the runsc binary this runtime executes. Either
    /// discovered by `find_runsc` or supplied by the caller via `with_path`.
    runsc_path: String,
}
61
62impl GVisorRuntime {
63    /// Create a new GVisor runtime manager
64    ///
65    /// This checks for runsc binary availability
66    pub fn new() -> Result<Self> {
67        let runsc_path = Self::find_runsc()?;
68        info!("Found runsc at: {}", runsc_path);
69        Ok(Self { runsc_path })
70    }
71
    /// Create a GVisor runtime with a pre-resolved runsc path.
    ///
    /// Use this when the path was resolved before privilege changes
    /// (e.g. before entering a user namespace where UID 0 would block
    /// PATH-based lookup).
    ///
    /// The path is stored as given: no existence, permission, or ownership
    /// validation is performed here.
    pub fn with_path(runsc_path: String) -> Self {
        Self { runsc_path }
    }
80
    /// Resolve the runsc path without constructing a full runtime.
    /// Call this before fork/unshare so the path is resolved while
    /// still unprivileged.
    ///
    /// # Errors
    /// Returns the error produced by `find_runsc` when no acceptable
    /// runsc binary is found.
    pub fn resolve_path() -> Result<String> {
        Self::find_runsc()
    }
87
88    /// Find the runsc binary
89    fn find_runsc() -> Result<String> {
90        // Try common locations
91        let paths = vec![
92            "/usr/local/bin/runsc",
93            "/usr/bin/runsc",
94            "/opt/gvisor/runsc",
95        ];
96
97        for path in &paths {
98            if let Some(validated) = Self::validate_runsc_path(Path::new(path))? {
99                return Ok(validated);
100            }
101        }
102
103        // For privileged execution, do not resolve runtime binaries via PATH.
104        // This avoids environment-based binary hijacking when running as root.
105        if Uid::effective().is_root() {
106            return Err(NucleusError::GVisorError(
107                "runsc binary not found in trusted system paths".to_string(),
108            ));
109        }
110
111        // Try to find in PATH without invoking a shell command.
112        if let Some(path_var) = std::env::var_os("PATH") {
113            for dir in std::env::split_paths(&path_var) {
114                let candidate = dir.join("runsc");
115                if let Some(validated) = Self::validate_runsc_path(&candidate)? {
116                    return Ok(validated);
117                }
118            }
119        }
120
121        Err(NucleusError::GVisorError(
122            "runsc binary not found. Please install gVisor.".to_string(),
123        ))
124    }
125
126    fn validate_runsc_path(path: &Path) -> Result<Option<String>> {
127        if !path.exists() {
128            return Ok(None);
129        }
130        if !path.is_file() {
131            return Ok(None);
132        }
133
134        let canonical = std::fs::canonicalize(path).map_err(|e| {
135            NucleusError::GVisorError(format!(
136                "Failed to canonicalize runsc path {:?}: {}",
137                path, e
138            ))
139        })?;
140
141        // If the candidate is a shell wrapper script (common on NixOS where
142        // nix wraps binaries to inject PATH), look for the real ELF binary
143        // next to it.  runsc's gofer subprocess re-execs via /proc/self/exe,
144        // which must point to the real binary – not a bash wrapper.
145        let resolved = Self::unwrap_nix_wrapper(&canonical).unwrap_or_else(|| canonical.clone());
146
147        let metadata = std::fs::metadata(&resolved).map_err(|e| {
148            NucleusError::GVisorError(format!("Failed to stat runsc path {:?}: {}", resolved, e))
149        })?;
150
151        let mode = metadata.permissions().mode();
152        if mode & 0o022 != 0 {
153            return Err(NucleusError::GVisorError(format!(
154                "Refusing insecure runsc binary permissions at {:?} (mode {:o})",
155                resolved, mode
156            )));
157        }
158        if mode & 0o111 == 0 {
159            return Ok(None);
160        }
161
162        // Reject binaries owned by other non-root users – a malicious user
163        // could place a trojan runsc earlier in PATH.
164        use std::os::unix::fs::MetadataExt;
165        let owner = metadata.uid();
166        let current_uid = nix::unistd::Uid::effective().as_raw();
167        if !Self::is_trusted_runsc_owner(&resolved, owner, current_uid) {
168            return Err(NucleusError::GVisorError(format!(
169                "Refusing runsc binary at {:?} owned by uid {} (expected root, current user {}, or immutable /nix/store artifact)",
170                resolved, owner, current_uid
171            )));
172        }
173
174        Ok(Some(resolved.to_string_lossy().to_string()))
175    }
176
177    fn is_trusted_runsc_owner(path: &Path, owner: u32, current_uid: u32) -> bool {
178        if owner == 0 || owner == current_uid {
179            return true;
180        }
181
182        // Nix store artifacts are immutable content-addressed paths and are
183        // commonly owned by `nobody` rather than root/current user.
184        // Extra hardening: verify the binary is not writable by *anyone* and
185        // the parent directory is also not writable, to guard against a
186        // compromised or mutable store.
187        if path.starts_with("/nix/store") {
188            if let Ok(meta) = std::fs::metadata(path) {
189                let mode = meta.permissions().mode();
190                // Reject if owner-writable (group/other already checked by caller)
191                if mode & 0o200 != 0 {
192                    return false;
193                }
194            } else {
195                return false;
196            }
197            // Verify the immediate parent directory is not writable
198            if let Some(parent) = path.parent() {
199                if let Ok(parent_meta) = std::fs::metadata(parent) {
200                    let parent_mode = parent_meta.permissions().mode();
201                    if parent_mode & 0o222 != 0 {
202                        return false;
203                    }
204                } else {
205                    return false;
206                }
207            }
208            return true;
209        }
210
211        false
212    }
213
214    /// If `path` is a Nix wrapper script, extract the real binary path.
215    ///
216    /// Nix wrapper scripts end with a line like:
217    ///   exec -a "$0" "/nix/store/…/.runsc-wrapped"  "$@"
218    /// We parse that to find the actual ELF binary.
219    fn unwrap_nix_wrapper(path: &Path) -> Option<std::path::PathBuf> {
220        let content = std::fs::read_to_string(path).ok()?;
221        // Only process short scripts (wrapper scripts are small)
222        if content.len() > 4096 || !content.starts_with("#!") {
223            return None;
224        }
225        // Look for the exec line that references the wrapped binary
226        for line in content.lines().rev() {
227            let trimmed = line.trim();
228            if trimmed.starts_with("exec ") {
229                // Parse: exec -a "$0" "/nix/store/.../bin/.runsc-wrapped"  "$@"
230                // or:    exec "/nix/store/.../bin/.runsc-wrapped"  "$@"
231                for token in trimmed.split_whitespace() {
232                    let unquoted = token.trim_matches('"');
233                    if unquoted.starts_with('/') && unquoted.contains("runsc") {
234                        let candidate = std::path::PathBuf::from(unquoted);
235                        if candidate.exists() && candidate.is_file() {
236                            debug!("Resolved Nix wrapper {:?} → {:?}", path, candidate);
237                            return Some(candidate);
238                        }
239                    }
240                }
241            }
242        }
243        None
244    }
245
    /// Execute using gVisor with an OCI bundle, using default settings.
    ///
    /// This is the OCI-compliant way to run containers with gVisor.
    /// This convenience entry point takes no network-mode argument; it
    /// delegates to `exec_with_oci_bundle_network` with:
    /// - `GVisorNetworkMode::None` → `--network none` (fully isolated, original behavior)
    /// - `rootless = false`
    /// - `GVisorPlatform::Systrap`
    ///
    /// Callers that need another network mode call
    /// `exec_with_oci_bundle_network` directly, where:
    /// - `GVisorNetworkMode::Sandbox` → `--network sandbox` (gVisor user-space network stack)
    /// - `GVisorNetworkMode::Host` → `--network host` (share host network namespace)
    pub fn exec_with_oci_bundle(&self, container_id: &str, bundle: &OciBundle) -> Result<()> {
        self.exec_with_oci_bundle_network(
            container_id,
            bundle,
            GVisorNetworkMode::None,
            false,
            GVisorPlatform::Systrap,
        )
    }
262
263    /// Execute using gVisor with an OCI bundle and explicit network mode.
264    ///
265    /// When `rootless` is true, the OCI spec is expected to carry explicit
266    /// user namespace mappings. In that mode we do not pass runsc's CLI
267    /// `--rootless` flag, because gVisor documents that flag as the
268    /// `runsc do`-oriented path rather than the OCI `run` path. We still skip runsc's
269    /// internal cgroup configuration because Nucleus already manages cgroups
270    /// externally and unprivileged callers cannot configure them directly.
271    pub fn exec_with_oci_bundle_network(
272        &self,
273        container_id: &str,
274        bundle: &OciBundle,
275        network_mode: GVisorNetworkMode,
276        rootless: bool,
277        platform: GVisorPlatform,
278    ) -> Result<()> {
279        info!(
280            "Executing with gVisor using OCI bundle at {:?} (network: {:?}, platform: {:?})",
281            bundle.bundle_path(),
282            network_mode,
283            platform,
284        );
285
286        let network_flag = match network_mode {
287            GVisorNetworkMode::None => "none",
288            GVisorNetworkMode::Sandbox => "sandbox",
289            GVisorNetworkMode::Host => "host",
290        };
291
292        // Create a per-container root directory for runsc state.
293        // By default runsc uses /var/run/runsc which requires root privileges.
294        // We place it next to the OCI bundle so it is cleaned up together.
295        let runsc_root = bundle
296            .bundle_path()
297            .parent()
298            .unwrap_or(bundle.bundle_path())
299            .join("runsc-root");
300        std::fs::create_dir_all(&runsc_root).map_err(|e| {
301            NucleusError::GVisorError(format!("Failed to create runsc root directory: {}", e))
302        })?;
303        std::fs::set_permissions(&runsc_root, std::fs::Permissions::from_mode(0o700)).map_err(
304            |e| {
305                NucleusError::GVisorError(format!(
306                    "Failed to secure runsc root directory permissions: {}",
307                    e
308                ))
309            },
310        )?;
311
312        let runsc_runtime_dir = runsc_root.join("runtime");
313        std::fs::create_dir_all(&runsc_runtime_dir).map_err(|e| {
314            NucleusError::GVisorError(format!("Failed to create runsc runtime directory: {}", e))
315        })?;
316        std::fs::set_permissions(&runsc_runtime_dir, std::fs::Permissions::from_mode(0o700))
317            .map_err(|e| {
318                NucleusError::GVisorError(format!(
319                    "Failed to secure runsc runtime directory permissions: {}",
320                    e
321                ))
322            })?;
323
324        // Build runsc command with OCI bundle.
325        // Global flags (--root, --network, --platform) must come BEFORE the subcommand.
326        // runsc --root <dir> --network <mode> --platform <plat> run --bundle <path> <id>
327        let args = self.build_oci_run_args(
328            container_id,
329            bundle,
330            &runsc_root,
331            network_flag,
332            rootless,
333            platform,
334        );
335
336        debug!("runsc OCI args: {:?}", args);
337
338        // Convert to CStrings for exec
339        let program = CString::new(self.runsc_path.as_str())
340            .map_err(|e| NucleusError::GVisorError(format!("Invalid runsc path: {}", e)))?;
341
342        let c_args: Result<Vec<CString>> = args
343            .iter()
344            .map(|arg| {
345                CString::new(arg.as_str())
346                    .map_err(|e| NucleusError::GVisorError(format!("Invalid argument: {}", e)))
347            })
348            .collect();
349        let c_args = c_args?;
350
351        let c_env = self.exec_environment(&runsc_runtime_dir)?;
352
353        // runsc starts its gofer by re-executing /proc/self/exe. Carrying
354        // no_new_privs into runsc makes that helper exec fail with EPERM on
355        // the locked-down NixOS VM profile, so leave gVisor to enforce its own
356        // sandbox process model after exec.
357
358        // execve - this replaces the current process with runsc
359        nix::unistd::execve::<std::ffi::CString, std::ffi::CString>(&program, &c_args, &c_env)?;
360
361        // Should never reach here
362        Ok(())
363    }
364
    /// Check if gVisor is available on this system
    ///
    /// Returns `true` when `find_runsc` succeeds. Any failure, including a
    /// candidate rejected by validation, is reported as `false`.
    pub fn is_available() -> bool {
        Self::find_runsc().is_ok()
    }
369
370    /// Get runsc version
371    pub fn version(&self) -> Result<String> {
372        let output = Command::new(&self.runsc_path)
373            .arg("--version")
374            .output()
375            .map_err(|e| NucleusError::GVisorError(format!("Failed to get version: {}", e)))?;
376
377        if !output.status.success() {
378            return Err(NucleusError::GVisorError(
379                "Failed to get runsc version".to_string(),
380            ));
381        }
382
383        let version = String::from_utf8_lossy(&output.stdout).to_string();
384        Ok(version.trim().to_string())
385    }
386
387    fn exec_environment(&self, runtime_dir: &Path) -> Result<Vec<CString>> {
388        let mut env = Vec::new();
389        let mut push = |key: &str, value: String| -> Result<()> {
390            env.push(
391                CString::new(format!("{}={}", key, value))
392                    .map_err(|e| NucleusError::GVisorError(format!("Invalid {}: {}", key, e)))?,
393            );
394            Ok(())
395        };
396
397        // Use a hardcoded PATH for the runsc supervisor process to prevent
398        // host PATH from leaking into the gVisor environment.
399        push(
400            "PATH",
401            "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
402        )?;
403        let runtime_dir = runtime_dir.to_string_lossy().to_string();
404        push("TMPDIR", runtime_dir.clone())?;
405        push("XDG_RUNTIME_DIR", runtime_dir)?;
406
407        // Hardcode safe values instead of leaking host identity/paths.
408        // HOME could point to an attacker-controlled directory; USER/LOGNAME
409        // leak host identity information – none of which gVisor needs.
410        push("HOME", "/root".to_string())?;
411        push("USER", "root".to_string())?;
412        push("LOGNAME", "root".to_string())?;
413
414        Ok(env)
415    }
416
417    fn build_oci_run_args(
418        &self,
419        container_id: &str,
420        bundle: &OciBundle,
421        runsc_root: &Path,
422        network_flag: &str,
423        rootless: bool,
424        platform: GVisorPlatform,
425    ) -> Vec<String> {
426        let mut args = vec![
427            self.runsc_path.clone(),
428            "--root".to_string(),
429            runsc_root.to_string_lossy().to_string(),
430        ];
431
432        // Rootless OCI mode relies on user namespace mappings in config.json.
433        // We intentionally do not pass runsc's CLI `--rootless` flag here.
434        if rootless {
435            args.push("--ignore-cgroups".to_string());
436        }
437
438        args.extend([
439            "--network".to_string(),
440            network_flag.to_string(),
441            "--platform".to_string(),
442            platform.as_flag().to_string(),
443            "run".to_string(),
444            "--bundle".to_string(),
445            bundle.bundle_path().to_string_lossy().to_string(),
446            container_id.to_string(),
447        ]);
448
449        args
450    }
451}
452
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Restores the process-wide `PATH` to its previous value when dropped,
    /// so a test that overrides it does not affect tests that run afterwards.
    struct PathGuard(Option<std::ffi::OsString>);

    impl PathGuard {
        /// Remember the current `PATH`, then replace it with `value`.
        fn set(value: &str) -> Self {
            let guard = PathGuard(std::env::var_os("PATH"));
            std::env::set_var("PATH", value);
            guard
        }
    }

    impl Drop for PathGuard {
        fn drop(&mut self) {
            match self.0.take() {
                Some(previous) => std::env::set_var("PATH", previous),
                None => std::env::remove_var("PATH"),
            }
        }
    }

    #[test]
    fn test_gvisor_availability() {
        // Smoke test only: the result depends on whether gVisor is installed,
        // so we just make sure the probe completes without panicking.
        let available = GVisorRuntime::is_available();
        println!("gVisor available: {}", available);
    }

    #[test]
    fn test_gvisor_new() {
        // Only exercised further when runsc is actually installed.
        let runtime = GVisorRuntime::new();
        if let Ok(rt) = runtime {
            println!("Found runsc at: {}", rt.runsc_path);
            if let Ok(version) = rt.version() {
                println!("runsc version: {}", version);
            }
        }
    }

    #[test]
    fn test_find_runsc() {
        // Test that find_runsc either succeeds or returns appropriate error
        match GVisorRuntime::find_runsc() {
            Ok(path) => {
                println!("Found runsc at: {}", path);
                assert!(!path.is_empty());
            }
            Err(e) => {
                println!("runsc not found (expected if gVisor not installed): {}", e);
            }
        }
    }

    #[test]
    fn test_validate_runsc_rejects_world_writable() {
        let dir = tempfile::tempdir().unwrap();
        let fake_runsc = dir.path().join("runsc");
        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
        // Make world-writable
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o777)).unwrap();

        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
        assert!(
            result.is_err(),
            "validate_runsc_path must reject world-writable binaries"
        );
    }

    #[test]
    fn test_validate_runsc_rejects_group_writable() {
        let dir = tempfile::tempdir().unwrap();
        let fake_runsc = dir.path().join("runsc");
        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
        // Make group-writable
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o775)).unwrap();

        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
        assert!(
            result.is_err(),
            "validate_runsc_path must reject group-writable binaries"
        );
    }

    #[test]
    fn test_runsc_owner_accepts_nix_store_artifact_owner() {
        // Look for a real `bin/runsc` inside a /nix/store entry so the
        // metadata/permission checks run against an actual store artifact.
        // The test is skipped when no such file exists on this machine.
        let nix_binary = std::fs::read_dir("/nix/store")
            .ok()
            .and_then(|mut entries| {
                entries.find_map(|entry| {
                    let candidate = entry.ok()?.path().join("bin/runsc");
                    candidate.exists().then_some(candidate)
                })
            });

        let Some(path) = nix_binary else {
            eprintln!("skipping: no runsc binary found in /nix/store");
            return;
        };

        assert!(GVisorRuntime::is_trusted_runsc_owner(&path, 65534, 1000));
    }

    #[test]
    fn test_exec_environment_uses_hardcoded_path() {
        // The gVisor supervisor must NOT inherit the host PATH, to prevent
        // host filesystem layout leaking into the container environment.
        // Verify by setting a distinctive PATH and checking exec_environment
        // returns a hardcoded value instead. The guard puts the original
        // PATH back when the test ends, even on assertion failure.
        let _restore_path = PathGuard::set("/tmp/evil-inject/bin:/opt/attacker/sbin");
        let rt = GVisorRuntime::with_path("/fake/runsc".to_string());
        let tmp = tempfile::tempdir().unwrap();
        let env = rt.exec_environment(tmp.path()).unwrap();
        let path_entry = env
            .iter()
            .find(|e| e.to_str().is_ok_and(|s| s.starts_with("PATH=")))
            .expect("exec_environment must set PATH");
        let path_val = path_entry.to_str().unwrap();
        assert!(
            !path_val.contains("evil-inject") && !path_val.contains("attacker"),
            "exec_environment must use hardcoded PATH, not host PATH. Got: {}",
            path_val
        );
        assert_eq!(
            path_val, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
            "exec_environment PATH must be the standard hardcoded value"
        );
    }

    #[test]
    fn test_runsc_owner_rejects_untrusted_non_store_owner() {
        assert!(!GVisorRuntime::is_trusted_runsc_owner(
            Path::new("/tmp/runsc"),
            4242,
            1000
        ));
    }
}