Skip to main content

nucleus/security/
gvisor.rs

1use super::landlock::LandlockManager;
2use crate::error::{NucleusError, Result};
3use crate::oci::OciBundle;
4use nix::unistd::Uid;
5use sha2::{Digest, Sha256};
6use std::ffi::CString;
7use std::fs::{self, DirBuilder, OpenOptions};
8use std::io;
9use std::os::unix::fs::{DirBuilderExt, MetadataExt, OpenOptionsExt, PermissionsExt};
10use std::path::{Component, Path, PathBuf};
11use std::process::Command;
12use tracing::{debug, info, warn};
13
/// Network mode for gVisor runtime.
///
/// `GVisorOciRunOptions::network_flag` renders each variant as the lowercase
/// string `"none"`, `"sandbox"` or `"host"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GVisorNetworkMode {
    /// No networking (fully isolated). Default for agent workloads, and the
    /// value used by `GVisorOciRunOptions::default()`.
    None,
    /// gVisor user-space network stack. Suitable for networked production services
    /// that need gVisor isolation with network access.
    Sandbox,
    /// Share host network namespace. Use with caution.
    Host,
}
25
/// Platform backend for gVisor's Sentry.
///
/// The derive list makes the enum usable as a `clap` value enum and
/// (de)serializable through `serde`. `Systrap` is the `Default` variant.
/// `GVisorPlatform::as_flag` yields the lowercase name of each variant.
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Default,
    clap::ValueEnum,
    serde::Serialize,
    serde::Deserialize,
)]
pub enum GVisorPlatform {
    /// systrap backend, the current default and most broadly compatible option.
    #[default]
    Systrap,
    /// KVM-backed sandboxing for the Sentry itself.
    Kvm,
    /// ptrace backend for maximal compatibility where systrap/KVM are unavailable.
    Ptrace,
}
47
48impl GVisorPlatform {
49    pub fn as_flag(self) -> &'static str {
50        match self {
51            Self::Systrap => "systrap",
52            Self::Kvm => "kvm",
53            Self::Ptrace => "ptrace",
54        }
55    }
56}
57
/// Options for running an OCI bundle with gVisor.
///
/// The struct is `Copy`, so it is passed by value. `Default` yields: no
/// networking, every boolean `false`, and the default platform.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GVisorOciRunOptions {
    /// gVisor networking mode passed to runsc.
    pub network_mode: GVisorNetworkMode,
    /// Skip runsc's cgroup setup when Nucleus manages cgroups externally.
    pub ignore_cgroups: bool,
    /// Use runsc's rootless execution path for pre-created user namespaces.
    pub runsc_rootless: bool,
    /// Fail if the host-side supervisor execute allowlist cannot be installed.
    pub require_supervisor_exec_policy: bool,
    /// gVisor Sentry platform backend.
    pub platform: GVisorPlatform,
}
72
/// Default run options: no networking, all boolean switches off, and the
/// default Sentry platform.
impl Default for GVisorOciRunOptions {
    fn default() -> Self {
        Self {
            // Fully isolated unless the caller opts into networking.
            network_mode: GVisorNetworkMode::None,
            ignore_cgroups: false,
            runsc_rootless: false,
            require_supervisor_exec_policy: false,
            // Whatever variant `GVisorPlatform` marks as `#[default]`.
            platform: GVisorPlatform::default(),
        }
    }
}
84
85impl GVisorOciRunOptions {
86    fn network_flag(self) -> &'static str {
87        match self.network_mode {
88            GVisorNetworkMode::None => "none",
89            GVisorNetworkMode::Sandbox => "sandbox",
90            GVisorNetworkMode::Host => "host",
91        }
92    }
93}
94
/// GVisor runtime manager
///
/// Implements the gVisor state machine from
/// NucleusSecurity_GVisor_GVisorRuntime.tla
///
/// The only state carried is the path of the `runsc` binary to use.
pub struct GVisorRuntime {
    /// Path to the `runsc` binary: either discovered by `find_runsc` (via
    /// `new`) or supplied as-is by the caller through `with_path`.
    runsc_path: String,
}
102
103impl GVisorRuntime {
104    /// Create a new GVisor runtime manager
105    ///
106    /// This checks for runsc binary availability
107    pub fn new() -> Result<Self> {
108        let runsc_path = Self::find_runsc()?;
109        info!("Found runsc at: {}", runsc_path);
110        Ok(Self { runsc_path })
111    }
112
    /// Create a GVisor runtime with a pre-resolved runsc path.
    ///
    /// Use this when the path was resolved before privilege changes
    /// (e.g. before entering a user namespace where UID 0 would block
    /// PATH-based lookup).
    ///
    /// The path is stored as given; no validation is performed here, so the
    /// caller is expected to pass a value obtained from `resolve_path`.
    pub fn with_path(runsc_path: String) -> Self {
        Self { runsc_path }
    }
121
    /// Resolve the runsc path without constructing a full runtime.
    /// Call this before fork/unshare so the path is resolved while
    /// still unprivileged.
    ///
    /// This is a direct call to `find_runsc`; its result (path or error) is
    /// returned unchanged.
    pub fn resolve_path() -> Result<String> {
        Self::find_runsc()
    }
128
129    /// Find the runsc binary
130    fn find_runsc() -> Result<String> {
131        // Try common locations
132        let paths = vec![
133            "/usr/local/bin/runsc",
134            "/usr/bin/runsc",
135            "/opt/gvisor/runsc",
136        ];
137
138        for path in &paths {
139            if let Some(validated) = Self::validate_runsc_path(Path::new(path))? {
140                return Ok(validated);
141            }
142        }
143
144        // For privileged execution, do not resolve runtime binaries via PATH.
145        // This avoids environment-based binary hijacking when running as root.
146        if Uid::effective().is_root() {
147            return Err(NucleusError::GVisorError(
148                "runsc binary not found in trusted system paths".to_string(),
149            ));
150        }
151
152        // Try to find in PATH without invoking a shell command.
153        if let Some(path_var) = std::env::var_os("PATH") {
154            for dir in std::env::split_paths(&path_var) {
155                let candidate = dir.join("runsc");
156                if let Some(validated) = Self::validate_runsc_path(&candidate)? {
157                    return Ok(validated);
158                }
159            }
160        }
161
162        Err(NucleusError::GVisorError(
163            "runsc binary not found. Please install gVisor.".to_string(),
164        ))
165    }
166
167    fn validate_runsc_path(path: &Path) -> Result<Option<String>> {
168        if !path.exists() {
169            return Ok(None);
170        }
171        if !path.is_file() {
172            return Ok(None);
173        }
174
175        let canonical = std::fs::canonicalize(path).map_err(|e| {
176            NucleusError::GVisorError(format!(
177                "Failed to canonicalize runsc path {:?}: {}",
178                path, e
179            ))
180        })?;
181
182        // If the candidate is a shell wrapper script (common on NixOS where
183        // nix wraps binaries to inject PATH), look for the real ELF binary
184        // next to it.  runsc's gofer subprocess re-execs via /proc/self/exe,
185        // which must point to the real binary – not a bash wrapper.
186        let resolved = Self::unwrap_nix_wrapper(&canonical).unwrap_or_else(|| canonical.clone());
187
188        let metadata = std::fs::metadata(&resolved).map_err(|e| {
189            NucleusError::GVisorError(format!("Failed to stat runsc path {:?}: {}", resolved, e))
190        })?;
191
192        let mode = metadata.permissions().mode();
193        if mode & 0o022 != 0 {
194            return Err(NucleusError::GVisorError(format!(
195                "Refusing insecure runsc binary permissions at {:?} (mode {:o})",
196                resolved, mode
197            )));
198        }
199        if mode & 0o111 == 0 {
200            return Ok(None);
201        }
202
203        // Reject binaries owned by other non-root users – a malicious user
204        // could place a trojan runsc earlier in PATH.
205        use std::os::unix::fs::MetadataExt;
206        let owner = metadata.uid();
207        let current_uid = nix::unistd::Uid::effective().as_raw();
208        if !Self::is_trusted_runsc_owner(&resolved, owner, current_uid) {
209            return Err(NucleusError::GVisorError(format!(
210                "Refusing runsc binary at {:?} owned by uid {} (expected root, current user {}, or immutable /nix/store artifact)",
211                resolved, owner, current_uid
212            )));
213        }
214
215        Ok(Some(resolved.to_string_lossy().to_string()))
216    }
217
218    fn is_trusted_runsc_owner(path: &Path, owner: u32, current_uid: u32) -> bool {
219        if owner == 0 || owner == current_uid {
220            return true;
221        }
222
223        // Nix store artifacts are immutable content-addressed paths and are
224        // commonly owned by `nobody` rather than root/current user.
225        // Extra hardening: verify the binary is not writable by *anyone* and
226        // the parent directory is also not writable, to guard against a
227        // compromised or mutable store.
228        if path.starts_with("/nix/store") {
229            if let Ok(meta) = std::fs::metadata(path) {
230                let mode = meta.permissions().mode();
231                // Reject if owner-writable (group/other already checked by caller)
232                if mode & 0o200 != 0 {
233                    return false;
234                }
235            } else {
236                return false;
237            }
238            // Verify the immediate parent directory is not writable
239            if let Some(parent) = path.parent() {
240                if let Ok(parent_meta) = std::fs::metadata(parent) {
241                    let parent_mode = parent_meta.permissions().mode();
242                    if parent_mode & 0o222 != 0 {
243                        return false;
244                    }
245                } else {
246                    return false;
247                }
248            }
249            return true;
250        }
251
252        false
253    }
254
255    /// If `path` is a Nix wrapper script, extract the real binary path.
256    ///
257    /// Nix wrapper scripts end with a line like:
258    ///   exec -a "$0" "/nix/store/…/.runsc-wrapped"  "$@"
259    /// We parse that to find the actual ELF binary.
260    fn unwrap_nix_wrapper(path: &Path) -> Option<std::path::PathBuf> {
261        let content = std::fs::read_to_string(path).ok()?;
262        // Only process short scripts (wrapper scripts are small)
263        if content.len() > 4096 || !content.starts_with("#!") {
264            return None;
265        }
266        // Look for the exec line that references the wrapped binary
267        for line in content.lines().rev() {
268            let trimmed = line.trim();
269            if trimmed.starts_with("exec ") {
270                // Parse: exec -a "$0" "/nix/store/.../bin/.runsc-wrapped"  "$@"
271                // or:    exec "/nix/store/.../bin/.runsc-wrapped"  "$@"
272                for token in trimmed.split_whitespace() {
273                    let unquoted = token.trim_matches('"');
274                    if unquoted.starts_with('/') && unquoted.contains("runsc") {
275                        let candidate = std::path::PathBuf::from(unquoted);
276                        if candidate.exists() && candidate.is_file() {
277                            debug!("Resolved Nix wrapper {:?} → {:?}", path, candidate);
278                            return Some(candidate);
279                        }
280                    }
281                }
282            }
283        }
284        None
285    }
286
    /// Execute using gVisor with an OCI bundle
    ///
    /// This is the OCI-compliant way to run containers with gVisor using
    /// default options: no networking, systrap platform, no rootless flag,
    /// and no internal cgroup setup override.
    ///
    /// Equivalent to calling `exec_with_oci_bundle_options` with
    /// `GVisorOciRunOptions::default()`; errors are passed through unchanged.
    pub fn exec_with_oci_bundle(&self, container_id: &str, bundle: &OciBundle) -> Result<()> {
        self.exec_with_oci_bundle_options(container_id, bundle, GVisorOciRunOptions::default())
    }
295
    /// Execute using gVisor with an OCI bundle and explicit run options.
    ///
    /// `ignore_cgroups` skips runsc's internal cgroup configuration because
    /// Nucleus already manages cgroups externally and unprivileged callers
    /// cannot configure them directly. `runsc_rootless` selects gVisor's
    /// built-in rootless execution path for cases where Nucleus already
    /// entered a mapped user namespace and therefore cannot express the
    /// namespace setup as an OCI `linux.uidMappings` request.
    /// `require_supervisor_exec_policy` fail-closes if Nucleus cannot install
    /// the host-side execute allowlist before handing control to runsc.
    ///
    /// Steps, in order: prepare the per-container runsc state root and its
    /// `runtime` subdirectory, pick the runsc program to exec, build the
    /// argument vector and environment, optionally install the supervisor
    /// exec policy (rootless only), then `execve` runsc.
    ///
    /// # Errors
    ///
    /// Returns an error if any directory preparation step fails, if the
    /// program path or an argument cannot be converted to a `CString`, if the
    /// supervisor exec policy call fails, or if `execve` itself fails. On a
    /// successful `execve` the current process image is replaced, so this
    /// function does not return to the caller.
    pub fn exec_with_oci_bundle_options(
        &self,
        container_id: &str,
        bundle: &OciBundle,
        options: GVisorOciRunOptions,
    ) -> Result<()> {
        info!(
            "Executing with gVisor using OCI bundle at {:?} (network: {:?}, platform: {:?})",
            bundle.bundle_path(),
            options.network_mode,
            options.platform,
        );

        // Create a per-container root directory for runsc state. Do not derive
        // this from the OCI bundle parent: --bundle may be operator-provided,
        // shared, or attacker-writable, while runsc state includes a staged
        // executable used by the supervisor process.
        let runsc_root = Self::secure_runsc_root(container_id)?;

        // Also exported to runsc as TMPDIR and XDG_RUNTIME_DIR (see
        // `exec_environment`).
        let runsc_runtime_dir = runsc_root.join("runtime");
        Self::ensure_secure_runsc_dir(&runsc_runtime_dir, "runsc runtime directory")?;

        // `program_path` is what actually gets exec'd; `exec_allow_roots` is
        // only consumed by the supervisor exec policy below.
        let (program_path, exec_allow_roots) =
            self.prepare_supervisor_runsc_program(&runsc_root)?;

        // Build runsc command with OCI bundle.
        // Global flags (--root, --network, --platform) must come BEFORE the subcommand.
        // runsc --root <dir> --network <mode> --platform <plat> run --bundle <path> <id>
        let mut args = self.build_oci_run_args(container_id, bundle, &runsc_root, options);
        // Make argv[0] match the program that is exec'd.
        // NOTE(review): `build_oci_run_args` is not visible here; this
        // indexing assumes it always returns at least one element - confirm.
        args[0] = program_path.to_string_lossy().to_string();

        debug!("runsc OCI args: {:?}", args);

        // Convert to CStrings for exec
        let program = CString::new(program_path.to_string_lossy().as_ref())
            .map_err(|e| NucleusError::GVisorError(format!("Invalid runsc path: {}", e)))?;

        // Collecting into `Result` stops at the first argument that cannot be
        // represented as a C string.
        let c_args: Result<Vec<CString>> = args
            .iter()
            .map(|arg| {
                CString::new(arg.as_str())
                    .map_err(|e| NucleusError::GVisorError(format!("Invalid argument: {}", e)))
            })
            .collect();
        let c_args = c_args?;

        let c_env = self.exec_environment(&runsc_runtime_dir)?;

        // runsc starts its gofer by re-executing /proc/self/exe. Carrying
        // no_new_privs into runsc makes that helper exec fail with EPERM on
        // the locked-down NixOS VM profile, so leave gVisor to enforce its own
        // sandbox process model after exec.
        //
        // For the rootless bridge path, Nucleus has already entered a mapped
        // user namespace. Install an execute-only Landlock allowlist there:
        // runsc may still re-exec itself, but escaped host-side code cannot
        // exec arbitrary host binaries such as NixOS setuid wrappers.
        if options.runsc_rootless {
            self.apply_supervisor_exec_policy(
                &exec_allow_roots,
                options.require_supervisor_exec_policy,
            )?;
        }

        // execve - this replaces the current process with runsc
        nix::unistd::execve::<std::ffi::CString, std::ffi::CString>(&program, &c_args, &c_env)?;

        // Should never reach here
        Ok(())
    }
376
    /// Execute using gVisor with an OCI bundle and explicit network mode.
    ///
    /// Prefer [`Self::exec_with_oci_bundle_options`] for new call sites.
    ///
    /// The positional arguments are packed one-to-one into a
    /// `GVisorOciRunOptions` value and forwarded; nothing else happens here.
    // The long parameter list is the reason `exec_with_oci_bundle_options`
    // is preferred; the lint is silenced rather than changing this signature.
    #[allow(clippy::too_many_arguments)]
    pub fn exec_with_oci_bundle_network(
        &self,
        container_id: &str,
        bundle: &OciBundle,
        network_mode: GVisorNetworkMode,
        ignore_cgroups: bool,
        runsc_rootless: bool,
        require_supervisor_exec_policy: bool,
        platform: GVisorPlatform,
    ) -> Result<()> {
        self.exec_with_oci_bundle_options(
            container_id,
            bundle,
            GVisorOciRunOptions {
                network_mode,
                ignore_cgroups,
                runsc_rootless,
                require_supervisor_exec_policy,
                platform,
            },
        )
    }
403
    /// Check if gVisor is available on this system
    ///
    /// Runs the same lookup as `find_runsc` and reports only whether it
    /// succeeded; the reason for a failure (binary missing vs. binary
    /// rejected by validation) is discarded.
    pub fn is_available() -> bool {
        Self::find_runsc().is_ok()
    }
408
409    /// Get runsc version
410    pub fn version(&self) -> Result<String> {
411        let output = Command::new(&self.runsc_path)
412            .arg("--version")
413            .output()
414            .map_err(|e| NucleusError::GVisorError(format!("Failed to get version: {}", e)))?;
415
416        if !output.status.success() {
417            return Err(NucleusError::GVisorError(
418                "Failed to get runsc version".to_string(),
419            ));
420        }
421
422        let version = String::from_utf8_lossy(&output.stdout).to_string();
423        Ok(version.trim().to_string())
424    }
425
426    fn exec_environment(&self, runtime_dir: &Path) -> Result<Vec<CString>> {
427        let mut env = Vec::new();
428        let mut push = |key: &str, value: String| -> Result<()> {
429            env.push(
430                CString::new(format!("{}={}", key, value))
431                    .map_err(|e| NucleusError::GVisorError(format!("Invalid {}: {}", key, e)))?,
432            );
433            Ok(())
434        };
435
436        // Use a hardcoded PATH for the runsc supervisor process to prevent
437        // host PATH from leaking into the gVisor environment.
438        push(
439            "PATH",
440            "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
441        )?;
442        let runtime_dir = runtime_dir.to_string_lossy().to_string();
443        push("TMPDIR", runtime_dir.clone())?;
444        push("XDG_RUNTIME_DIR", runtime_dir)?;
445
446        // Hardcode safe values instead of leaking host identity/paths.
447        // HOME could point to an attacker-controlled directory; USER/LOGNAME
448        // leak host identity information – none of which gVisor needs.
449        push("HOME", "/root".to_string())?;
450        push("USER", "root".to_string())?;
451        push("LOGNAME", "root".to_string())?;
452
453        Ok(env)
454    }
455
456    fn prepare_supervisor_runsc_program(
457        &self,
458        runsc_root: &Path,
459    ) -> Result<(PathBuf, Vec<PathBuf>)> {
460        let canonical = fs::canonicalize(&self.runsc_path).map_err(|e| {
461            NucleusError::GVisorError(format!(
462                "Failed to canonicalize runsc path {:?}: {}",
463                self.runsc_path, e
464            ))
465        })?;
466
467        if canonical.starts_with("/nix/store") {
468            return Ok((canonical, vec![PathBuf::from("/nix/store")]));
469        }
470
471        Self::ensure_secure_runsc_dir(runsc_root, "runsc root directory")?;
472        let private_dir = runsc_root.join("exec-allow");
473        Self::ensure_secure_runsc_dir(&private_dir, "private runsc exec directory")?;
474
475        let stage_dir = Self::create_unique_runsc_stage_dir(&private_dir)?;
476        let staged = stage_dir.join("runsc");
477        Self::copy_runsc_nofollow(&canonical, &staged)?;
478
479        Ok((staged, vec![private_dir]))
480    }
481
482    fn secure_runsc_root(container_id: &str) -> Result<PathBuf> {
483        let artifact_base = Self::gvisor_artifact_base()?;
484        let artifact_dir = artifact_base.join(Self::runsc_state_component(container_id));
485
486        if Self::host_root_requires_trusted_runsc_ancestry() {
487            Self::ensure_trusted_host_root_runsc_ancestry(
488                &artifact_base,
489                "gVisor runsc artifact base",
490            )?;
491        }
492
493        Self::ensure_secure_runsc_dir(&artifact_base, "gVisor runsc artifact base")?;
494        Self::ensure_secure_runsc_dir(&artifact_dir, "gVisor runsc artifact directory")?;
495
496        let runsc_root = artifact_dir.join("runsc-root");
497        Self::ensure_secure_runsc_dir(&runsc_root, "runsc root directory")?;
498        Ok(runsc_root)
499    }
500
501    fn gvisor_artifact_base() -> Result<PathBuf> {
502        if let Some(path) =
503            std::env::var_os("NUCLEUS_GVISOR_ARTIFACT_BASE").filter(|path| !path.is_empty())
504        {
505            return Self::absolute_path(Path::new(&path), "gVisor artifact base");
506        }
507
508        if !Uid::effective().is_root() || Self::root_uid_maps_to_unprivileged_host_uid_from_proc() {
509            if let Some(dir) = dirs::runtime_dir() {
510                return Ok(dir.join("nucleus-gvisor"));
511            }
512        }
513
514        if Uid::effective().is_root() {
515            Ok(PathBuf::from("/run/nucleus-gvisor"))
516        } else {
517            Ok(std::env::temp_dir().join(format!("nucleus-gvisor-{}", Uid::effective().as_raw())))
518        }
519    }
520
521    fn absolute_path(path: &Path, label: &str) -> Result<PathBuf> {
522        if path.is_absolute() {
523            return Ok(path.to_path_buf());
524        }
525
526        std::env::current_dir()
527            .map(|cwd| cwd.join(path))
528            .map_err(|e| {
529                NucleusError::GVisorError(format!(
530                    "Failed to resolve current directory for {} {:?}: {}",
531                    label, path, e
532                ))
533            })
534    }
535
536    fn runsc_state_component(container_id: &str) -> String {
537        if container_id.len() == 32 && container_id.chars().all(|c| c.is_ascii_hexdigit()) {
538            return container_id.to_string();
539        }
540
541        let digest = Sha256::digest(container_id.as_bytes());
542        format!("id-{}", hex::encode(&digest[..16]))
543    }
544
545    fn root_uid_maps_to_unprivileged_host_uid_from_proc() -> bool {
546        fs::read_to_string("/proc/self/uid_map")
547            .map(|uid_map| Self::root_uid_maps_to_unprivileged_host_uid(&uid_map))
548            .unwrap_or(false)
549    }
550
551    fn root_uid_maps_to_unprivileged_host_uid(uid_map: &str) -> bool {
552        for line in uid_map.lines() {
553            let mut fields = line.split_whitespace();
554            let Some(namespace_start) = fields.next() else {
555                continue;
556            };
557            let Some(host_start) = fields.next() else {
558                continue;
559            };
560            let Some(length) = fields.next() else {
561                continue;
562            };
563            if fields.next().is_some() {
564                continue;
565            }
566
567            let Ok(namespace_start) = namespace_start.parse::<u64>() else {
568                continue;
569            };
570            let Ok(host_start) = host_start.parse::<u64>() else {
571                continue;
572            };
573            let Ok(length) = length.parse::<u64>() else {
574                continue;
575            };
576
577            if namespace_start == 0 && length > 0 {
578                return host_start != 0;
579            }
580        }
581
582        false
583    }
584
    /// Returns `true` only when the effective UID is root and
    /// `/proc/self/uid_map` does not show UID 0 mapped to a non-zero host UID.
    /// `secure_runsc_root` uses this to decide whether the artifact base's
    /// ancestors must be vetted.
    fn host_root_requires_trusted_runsc_ancestry() -> bool {
        // Short-circuit: the uid_map file is only read when running as root.
        Uid::effective().is_root() && !Self::root_uid_maps_to_unprivileged_host_uid_from_proc()
    }
588
589    fn ensure_trusted_host_root_runsc_ancestry(path: &Path, label: &str) -> Result<()> {
590        let path = Self::absolute_path(path, label)?;
591
592        let mut current = PathBuf::new();
593        for component in path.components() {
594            match component {
595                Component::Prefix(prefix) => current.push(prefix.as_os_str()),
596                Component::RootDir => current.push(component.as_os_str()),
597                Component::CurDir => {}
598                Component::ParentDir => {
599                    return Err(NucleusError::GVisorError(format!(
600                        "{} {:?} contains a parent-directory component",
601                        label, path
602                    )));
603                }
604                Component::Normal(name) => {
605                    current.push(name);
606                    match fs::symlink_metadata(&current) {
607                        Ok(metadata) => Self::ensure_trusted_host_root_runsc_ancestor_component(
608                            &current, metadata, label,
609                        )?,
610                        Err(e) if e.kind() == io::ErrorKind::NotFound => break,
611                        Err(e) => {
612                            return Err(NucleusError::GVisorError(format!(
613                                "Failed to stat {} ancestor {:?}: {}",
614                                label, current, e
615                            )));
616                        }
617                    }
618                }
619            }
620        }
621
622        Ok(())
623    }
624
625    fn ensure_trusted_host_root_runsc_ancestor_component(
626        path: &Path,
627        metadata: fs::Metadata,
628        label: &str,
629    ) -> Result<()> {
630        if metadata.file_type().is_symlink() {
631            return Err(NucleusError::GVisorError(format!(
632                "Refusing symlink {} ancestor {:?}",
633                label, path
634            )));
635        }
636        if !metadata.file_type().is_dir() {
637            return Err(NucleusError::GVisorError(format!(
638                "{} ancestor {:?} is not a directory",
639                label, path
640            )));
641        }
642
643        let owner = metadata.uid();
644        if owner != 0 {
645            return Err(NucleusError::GVisorError(format!(
646                "{} ancestor {:?} is owned by uid {} (expected root)",
647                label, path, owner
648            )));
649        }
650
651        let mode = metadata.permissions().mode();
652        if mode & 0o022 != 0 && mode & 0o1000 == 0 {
653            return Err(NucleusError::GVisorError(format!(
654                "{} ancestor {:?} has unsafe permissions {:o}",
655                label,
656                path,
657                mode & 0o7777
658            )));
659        }
660
661        Ok(())
662    }
663
    /// Ensure `path` is a directory owned by the effective UID with mode
    /// `0o700`, creating it if it does not exist.
    ///
    /// Steps, in order:
    /// 1. vet the immediate parent with `ensure_trusted_runsc_parent` (skipped
    ///    when `path` has no non-empty parent);
    /// 2. reject an existing symlink or non-directory at `path`, or create the
    ///    directory with mode `0o700` if nothing is there;
    /// 3. open the directory with `O_NOFOLLOW | O_DIRECTORY` and run the
    ///    remaining checks on that open handle;
    /// 4. require the owner to equal the effective UID;
    /// 5. reset the permission bits to `0o700` through the handle if they
    ///    differ.
    ///
    /// `label` is a human-readable name used only in error messages.
    ///
    /// # Errors
    ///
    /// Returns `NucleusError::GVisorError` when any check or filesystem
    /// operation above fails.
    fn ensure_secure_runsc_dir(path: &Path, label: &str) -> Result<()> {
        if let Some(parent) = path
            .parent()
            .filter(|parent| !parent.as_os_str().is_empty())
        {
            Self::ensure_trusted_runsc_parent(parent, label)?;
        }

        // Tracks whether this call created the directory, so its mode can be
        // set explicitly right after creation.
        let mut created = false;
        // symlink_metadata inspects `path` itself rather than a link target.
        match fs::symlink_metadata(path) {
            Ok(metadata) if metadata.file_type().is_symlink() => {
                return Err(NucleusError::GVisorError(format!(
                    "Refusing symlink {} {:?}",
                    label, path
                )));
            }
            Ok(metadata) if !metadata.file_type().is_dir() => {
                return Err(NucleusError::GVisorError(format!(
                    "{} {:?} is not a directory",
                    label, path
                )));
            }
            // Already an existing directory; it is verified further below.
            Ok(_) => {}
            Err(e) if e.kind() == io::ErrorKind::NotFound => {
                match DirBuilder::new().mode(0o700).create(path) {
                    Ok(()) => {
                        created = true;
                    }
                    // Something appeared between the stat and the create.
                    // Whatever it is gets validated by the open + checks
                    // below instead of being trusted.
                    Err(create_err) if create_err.kind() == io::ErrorKind::AlreadyExists => {}
                    Err(create_err) => {
                        return Err(NucleusError::GVisorError(format!(
                            "Failed to create {} {:?}: {}",
                            label, path, create_err
                        )));
                    }
                }
            }
            Err(e) => {
                return Err(NucleusError::GVisorError(format!(
                    "Failed to stat {} {:?}: {}",
                    label, path, e
                )));
            }
        }

        if created {
            // Set the mode explicitly on the directory this call just created.
            fs::set_permissions(path, fs::Permissions::from_mode(0o700)).map_err(|e| {
                NucleusError::GVisorError(format!(
                    "Failed to secure newly-created {} permissions {:?}: {}",
                    label, path, e
                ))
            })?;
        }

        // Open without following a symlink at the final component and only if
        // it is a directory; the checks below use this handle, not the path.
        let dir = OpenOptions::new()
            .read(true)
            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC | libc::O_DIRECTORY)
            .open(path)
            .map_err(|e| {
                NucleusError::GVisorError(format!(
                    "Failed to open {} {:?} without following symlinks: {}",
                    label, path, e
                ))
            })?;

        let metadata = dir.metadata().map_err(|e| {
            NucleusError::GVisorError(format!("Failed to stat {} {:?}: {}", label, path, e))
        })?;
        if !metadata.file_type().is_dir() {
            return Err(NucleusError::GVisorError(format!(
                "{} {:?} is not a directory",
                label, path
            )));
        }

        // A pre-existing directory owned by anyone else is rejected outright.
        let owner = metadata.uid();
        let expected = Uid::effective().as_raw();
        if owner != expected {
            return Err(NucleusError::GVisorError(format!(
                "{} {:?} is owned by uid {} (expected {})",
                label, path, owner, expected
            )));
        }

        // Only the rwx bits are compared; a directory we own with a different
        // mode is tightened to 0o700 rather than rejected.
        let mode = metadata.permissions().mode() & 0o777;
        if mode != 0o700 {
            dir.set_permissions(fs::Permissions::from_mode(0o700))
                .map_err(|e| {
                    NucleusError::GVisorError(format!(
                        "Failed to secure {} permissions {:?}: {}",
                        label, path, e
                    ))
                })?;
        }

        Ok(())
    }
761
762    fn ensure_trusted_runsc_parent(parent: &Path, label: &str) -> Result<()> {
763        let metadata = fs::symlink_metadata(parent).map_err(|e| {
764            NucleusError::GVisorError(format!(
765                "Failed to stat parent for {} {:?}: {}",
766                label, parent, e
767            ))
768        })?;
769        if metadata.file_type().is_symlink() {
770            return Err(NucleusError::GVisorError(format!(
771                "Refusing symlink parent for {} {:?}",
772                label, parent
773            )));
774        }
775        if !metadata.file_type().is_dir() {
776            return Err(NucleusError::GVisorError(format!(
777                "Parent for {} {:?} is not a directory",
778                label, parent
779            )));
780        }
781
782        let owner = metadata.uid();
783        let current = Uid::effective().as_raw();
784        let owner_trusted = owner == current || owner == 0;
785        let mode = metadata.permissions().mode();
786        let unsafe_writable = mode & 0o022 != 0 && mode & 0o1000 == 0;
787        if !owner_trusted || unsafe_writable {
788            return Err(NucleusError::GVisorError(format!(
789                "Parent for {} {:?} is not trusted (owner uid {}, mode {:o})",
790                label,
791                parent,
792                owner,
793                mode & 0o7777
794            )));
795        }
796
797        Ok(())
798    }
799
800    fn create_unique_runsc_stage_dir(private_dir: &Path) -> Result<PathBuf> {
801        let nonce = std::time::SystemTime::now()
802            .duration_since(std::time::UNIX_EPOCH)
803            .map(|duration| duration.as_nanos())
804            .unwrap_or_default();
805
806        for attempt in 0..100u32 {
807            let stage_dir = private_dir.join(format!(
808                "stage-{}-{}-{}",
809                std::process::id(),
810                nonce,
811                attempt
812            ));
813            match DirBuilder::new().mode(0o700).create(&stage_dir) {
814                Ok(()) => {
815                    Self::ensure_secure_runsc_dir(&stage_dir, "runsc stage directory")?;
816                    return Ok(stage_dir);
817                }
818                Err(e) if e.kind() == io::ErrorKind::AlreadyExists => continue,
819                Err(e) => {
820                    return Err(NucleusError::GVisorError(format!(
821                        "Failed to create runsc stage directory {:?}: {}",
822                        stage_dir, e
823                    )));
824                }
825            }
826        }
827
828        Err(NucleusError::GVisorError(format!(
829            "Failed to create unique runsc stage directory under {:?}",
830            private_dir
831        )))
832    }
833
834    fn copy_runsc_nofollow(source: &Path, staged: &Path) -> Result<()> {
835        let mut source_file = OpenOptions::new()
836            .read(true)
837            .custom_flags(libc::O_CLOEXEC)
838            .open(source)
839            .map_err(|e| {
840                NucleusError::GVisorError(format!(
841                    "Failed to open runsc source {:?}: {}",
842                    source, e
843                ))
844            })?;
845
846        let source_meta = source_file.metadata().map_err(|e| {
847            NucleusError::GVisorError(format!("Failed to stat runsc source {:?}: {}", source, e))
848        })?;
849        if !source_meta.file_type().is_file() {
850            return Err(NucleusError::GVisorError(format!(
851                "runsc source {:?} is not a regular file",
852                source
853            )));
854        }
855
856        let mut staged_file = OpenOptions::new()
857            .write(true)
858            .create_new(true)
859            .mode(0o500)
860            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
861            .open(staged)
862            .map_err(|e| {
863                NucleusError::GVisorError(format!(
864                    "Failed to create staged runsc binary {:?}: {}",
865                    staged, e
866                ))
867            })?;
868
869        io::copy(&mut source_file, &mut staged_file).map_err(|e| {
870            NucleusError::GVisorError(format!(
871                "Failed to stage runsc binary from {:?} to {:?}: {}",
872                source, staged, e
873            ))
874        })?;
875        staged_file
876            .set_permissions(fs::Permissions::from_mode(0o500))
877            .map_err(|e| {
878                NucleusError::GVisorError(format!(
879                    "Failed to secure staged runsc binary {:?}: {}",
880                    staged, e
881                ))
882            })?;
883        staged_file.sync_all().map_err(|e| {
884            NucleusError::GVisorError(format!(
885                "Failed to sync staged runsc binary {:?}: {}",
886                staged, e
887            ))
888        })?;
889
890        Ok(())
891    }
892
893    fn apply_supervisor_exec_policy(
894        &self,
895        allowed_roots: &[PathBuf],
896        required: bool,
897    ) -> Result<()> {
898        let mut landlock = LandlockManager::new();
899        let applied = landlock.apply_execute_allowlist_policy(allowed_roots, !required)?;
900        if applied {
901            info!(
902                allowed_roots = ?allowed_roots,
903                "Applied gVisor supervisor execute allowlist"
904            );
905        } else if required {
906            return Err(NucleusError::LandlockError(
907                "Required gVisor supervisor execute allowlist was not applied".to_string(),
908            ));
909        } else {
910            warn!(
911                allowed_roots = ?allowed_roots,
912                "gVisor supervisor execute allowlist unavailable"
913            );
914        }
915        Ok(())
916    }
917
918    fn build_oci_run_args(
919        &self,
920        container_id: &str,
921        bundle: &OciBundle,
922        runsc_root: &Path,
923        options: GVisorOciRunOptions,
924    ) -> Vec<String> {
925        let mut args = vec![
926            self.runsc_path.clone(),
927            "--root".to_string(),
928            runsc_root.to_string_lossy().to_string(),
929        ];
930
931        if options.runsc_rootless {
932            args.push("--rootless".to_string());
933        }
934
935        if options.ignore_cgroups {
936            args.push("--ignore-cgroups".to_string());
937        }
938
939        args.extend([
940            "--network".to_string(),
941            options.network_flag().to_string(),
942            "--platform".to_string(),
943            options.platform.as_flag().to_string(),
944            "run".to_string(),
945            "--bundle".to_string(),
946            bundle.bundle_path().to_string_lossy().to_string(),
947            container_id.to_string(),
948        ]);
949
950        args
951    }
952}
953
#[cfg(test)]
mod tests {
    use super::*;
    use crate::oci::OciConfig;
    use std::path::Path;
    use std::sync::{Mutex, MutexGuard, PoisonError};

    /// Serializes tests that mutate process-wide environment variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// RAII handle on [`ENV_LOCK`]; hold it for the whole duration of any test
    /// that sets or removes environment variables.
    struct EnvLock {
        _guard: MutexGuard<'static, ()>,
    }

    impl EnvLock {
        /// Blocks until the environment lock is held.
        ///
        /// A poisoned lock is recovered instead of panicking: the mutex guards
        /// no data of its own, so a panic in one environment test must not
        /// make every later environment test fail for an unrelated reason.
        fn acquire() -> Self {
            Self {
                _guard: ENV_LOCK.lock().unwrap_or_else(PoisonError::into_inner),
            }
        }
    }

    /// Sets or removes one environment variable and restores its previous
    /// state (value or absence) when dropped.
    struct EnvVarGuard {
        key: &'static str,
        previous: Option<std::ffi::OsString>,
    }

    impl EnvVarGuard {
        /// Sets `key` to `value`, remembering what was there before.
        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
            let previous = std::env::var_os(key);
            std::env::set_var(key, value);
            Self { key, previous }
        }

        /// Removes `key`, remembering what was there before.
        fn remove(key: &'static str) -> Self {
            let previous = std::env::var_os(key);
            std::env::remove_var(key);
            Self { key, previous }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            match &self.previous {
                Some(value) => std::env::set_var(self.key, value),
                None => std::env::remove_var(self.key),
            }
        }
    }

    #[test]
    fn test_gvisor_availability() {
        // This test just checks if we can determine availability
        // It may pass or fail depending on whether gVisor is installed
        let available = GVisorRuntime::is_available();
        println!("gVisor available: {}", available);
    }

    #[test]
    fn test_gvisor_new() {
        let runtime = GVisorRuntime::new();
        if let Ok(rt) = runtime {
            println!("Found runsc at: {}", rt.runsc_path);
            if let Ok(version) = rt.version() {
                println!("runsc version: {}", version);
            }
        }
    }

    #[test]
    fn test_find_runsc() {
        // Test that find_runsc either succeeds or returns appropriate error
        match GVisorRuntime::find_runsc() {
            Ok(path) => {
                println!("Found runsc at: {}", path);
                assert!(!path.is_empty());
            }
            Err(e) => {
                println!("runsc not found (expected if gVisor not installed): {}", e);
            }
        }
    }

    #[test]
    fn test_validate_runsc_rejects_world_writable() {
        let dir = tempfile::tempdir().unwrap();
        let fake_runsc = dir.path().join("runsc");
        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
        // Make world-writable
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o777)).unwrap();

        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
        assert!(
            result.is_err(),
            "validate_runsc_path must reject world-writable binaries"
        );
    }

    #[test]
    fn test_validate_runsc_rejects_group_writable() {
        let dir = tempfile::tempdir().unwrap();
        let fake_runsc = dir.path().join("runsc");
        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
        // Make group-writable
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o775)).unwrap();

        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
        assert!(
            result.is_err(),
            "validate_runsc_path must reject group-writable binaries"
        );
    }

    #[test]
    fn test_runsc_owner_accepts_nix_store_artifact_owner() {
        // Use a real Nix store binary so the metadata/permission checks pass.
        // The /nix/store contents are read-only and content-addressed, so any
        // existing file with mode 555 works.
        let nix_binary = std::fs::read_dir("/nix/store")
            .ok()
            .and_then(|mut entries| {
                entries.find_map(|e| {
                    let dir = e.ok()?.path();
                    let candidate = dir.join("bin/runsc");
                    if candidate.exists() {
                        Some(candidate)
                    } else {
                        None
                    }
                })
            });

        let path = match nix_binary {
            Some(p) => p,
            None => {
                eprintln!("skipping: no runsc binary found in /nix/store");
                return;
            }
        };

        assert!(GVisorRuntime::is_trusted_runsc_owner(&path, 65534, 1000));
    }

    #[test]
    fn test_exec_environment_uses_hardcoded_path() {
        // The gVisor supervisor must NOT inherit the host PATH, to prevent
        // host filesystem layout leaking into the container environment.
        // Verify by setting a distinctive PATH and checking exec_environment
        // returns a hardcoded value instead.
        //
        // PATH is process-wide state shared with every other test thread, so
        // take the environment lock and restore the original value on exit
        // (including on assertion failure) instead of leaving the bogus PATH
        // behind for later tests.
        let _env_lock = EnvLock::acquire();
        let _path = EnvVarGuard::set("PATH", "/tmp/evil-inject/bin:/opt/attacker/sbin");
        let rt = GVisorRuntime::with_path("/fake/runsc".to_string());
        let tmp = tempfile::tempdir().unwrap();
        let env = rt.exec_environment(tmp.path()).unwrap();
        let path_entry = env
            .iter()
            .find(|e| e.to_str().is_ok_and(|s| s.starts_with("PATH=")))
            .expect("exec_environment must set PATH");
        let path_val = path_entry.to_str().unwrap();
        assert!(
            !path_val.contains("evil-inject") && !path_val.contains("attacker"),
            "exec_environment must use hardcoded PATH, not host PATH. Got: {}",
            path_val
        );
        assert_eq!(
            path_val, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
            "exec_environment PATH must be the standard hardcoded value"
        );
    }

    #[test]
    fn test_precreated_rootless_args_pass_runsc_rootless() {
        let rt = GVisorRuntime::with_path("/nix/store/fake-runsc/bin/runsc".to_string());
        let tmp = tempfile::tempdir().unwrap();
        let bundle = OciBundle::new(
            tmp.path().join("bundle"),
            OciConfig::new(vec!["/bin/true".to_string()], None),
        );

        let args = rt.build_oci_run_args(
            "container-id",
            &bundle,
            tmp.path(),
            GVisorOciRunOptions {
                network_mode: GVisorNetworkMode::Host,
                ignore_cgroups: true,
                runsc_rootless: true,
                require_supervisor_exec_policy: false,
                platform: GVisorPlatform::Systrap,
            },
        );

        assert!(args.iter().any(|arg| arg == "--rootless"));
        assert!(args.iter().any(|arg| arg == "--ignore-cgroups"));
    }

    #[test]
    fn test_rootless_oci_args_do_not_pass_runsc_rootless() {
        let rt = GVisorRuntime::with_path("/nix/store/fake-runsc/bin/runsc".to_string());
        let tmp = tempfile::tempdir().unwrap();
        let bundle = OciBundle::new(
            tmp.path().join("bundle"),
            OciConfig::new(vec!["/bin/true".to_string()], None),
        );

        let args = rt.build_oci_run_args(
            "container-id",
            &bundle,
            tmp.path(),
            GVisorOciRunOptions {
                network_mode: GVisorNetworkMode::Host,
                ignore_cgroups: true,
                runsc_rootless: false,
                require_supervisor_exec_policy: false,
                platform: GVisorPlatform::Systrap,
            },
        );

        assert!(!args.iter().any(|arg| arg == "--rootless"));
        assert!(args.iter().any(|arg| arg == "--ignore-cgroups"));
    }

    #[test]
    fn test_non_nix_runsc_is_staged_for_supervisor_exec_policy() {
        let tmp = tempfile::tempdir().unwrap();
        let fake_runsc = tmp.path().join("runsc-source");
        std::fs::write(&fake_runsc, b"fake-runsc").unwrap();
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o500)).unwrap();

        let rt = GVisorRuntime::with_path(fake_runsc.to_string_lossy().to_string());
        let runsc_root = tmp.path().join("runsc-root");
        let (program, allow_roots) = rt.prepare_supervisor_runsc_program(&runsc_root).unwrap();

        assert!(program.starts_with(runsc_root.join("exec-allow")));
        assert_eq!(allow_roots, vec![runsc_root.join("exec-allow")]);
        assert_eq!(std::fs::read(&program).unwrap(), b"fake-runsc");
        let mode = std::fs::metadata(&program).unwrap().permissions().mode() & 0o777;
        assert_eq!(mode, 0o500);
    }

    #[test]
    fn test_runsc_root_uses_hardened_artifact_dir_not_bundle_parent() {
        let _env_lock = EnvLock::acquire();
        let tmp = tempfile::tempdir().unwrap();
        let artifact_base = tmp.path().join("gvisor-artifacts");
        let _artifact_base = EnvVarGuard::set("NUCLEUS_GVISOR_ARTIFACT_BASE", &artifact_base);
        let _runtime = EnvVarGuard::remove("XDG_RUNTIME_DIR");

        let bundle_parent = tmp.path().join("shared");
        std::fs::create_dir_all(&bundle_parent).unwrap();
        std::fs::set_permissions(&bundle_parent, std::fs::Permissions::from_mode(0o777)).unwrap();
        let bundle = OciBundle::new(
            bundle_parent.join("bundle"),
            OciConfig::new(vec!["/bin/true".to_string()], None),
        );

        let runsc_root = GVisorRuntime::secure_runsc_root("container-id").unwrap();

        assert!(runsc_root
            .starts_with(artifact_base.join(GVisorRuntime::runsc_state_component("container-id"))));
        assert!(
            !runsc_root.starts_with(bundle.bundle_path().parent().unwrap()),
            "runsc root must not be derived from a custom bundle parent"
        );
    }

    #[test]
    fn test_runsc_staging_rejects_symlink_exec_allow_dir() {
        let tmp = tempfile::tempdir().unwrap();
        let fake_runsc = tmp.path().join("runsc-source");
        std::fs::write(&fake_runsc, b"fake-runsc").unwrap();
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o500)).unwrap();

        let runsc_root = tmp.path().join("runsc-root");
        std::fs::create_dir(&runsc_root).unwrap();
        std::fs::set_permissions(&runsc_root, std::fs::Permissions::from_mode(0o700)).unwrap();
        let victim_dir = tmp.path().join("victim");
        std::fs::create_dir(&victim_dir).unwrap();
        std::os::unix::fs::symlink(&victim_dir, runsc_root.join("exec-allow")).unwrap();

        let rt = GVisorRuntime::with_path(fake_runsc.to_string_lossy().to_string());
        let err = rt
            .prepare_supervisor_runsc_program(&runsc_root)
            .unwrap_err()
            .to_string();

        assert!(
            err.contains("Refusing symlink private runsc exec directory"),
            "unexpected error: {}",
            err
        );
        assert!(
            !victim_dir.join("runsc").exists(),
            "staging must not follow the exec-allow symlink"
        );
    }

    #[test]
    fn test_runsc_owner_rejects_untrusted_non_store_owner() {
        assert!(!GVisorRuntime::is_trusted_runsc_owner(
            Path::new("/tmp/runsc"),
            4242,
            1000
        ));
    }
}