Skip to main content

nucleus/security/
gvisor.rs

1use super::landlock::LandlockManager;
2use crate::error::{NucleusError, Result};
3use crate::oci::OciBundle;
4use nix::unistd::Uid;
5use sha2::{Digest, Sha256};
6use std::ffi::CString;
7use std::fs::{self, DirBuilder, OpenOptions};
8use std::io;
9use std::os::unix::fs::{DirBuilderExt, MetadataExt, OpenOptionsExt, PermissionsExt};
10use std::path::{Component, Path, PathBuf};
11use std::process::Command;
12use tracing::{debug, info, warn};
13
/// Absolute path of the Nix store root. Compiled only into test builds.
#[cfg(test)]
const NIX_STORE_EXEC_ROOT: &str = "/nix/store";
16
/// Network mode for gVisor runtime.
///
/// The variant is carried in [`GVisorOciRunOptions::network_mode`], which
/// translates it into the lowercase flag values `none`, `sandbox` and `host`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GVisorNetworkMode {
    /// No networking (fully isolated). Default for agent workloads.
    None,
    /// gVisor user-space network stack. Suitable for networked production services
    /// that need gVisor isolation with network access.
    Sandbox,
    /// Share host network namespace. Use with caution.
    Host,
}
28
/// Platform backend for gVisor's Sentry.
///
/// Derives `clap::ValueEnum` and the serde traits, so the same type can be
/// parsed from the command line and (de)serialized. [`Default`] yields
/// [`GVisorPlatform::Systrap`].
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Default,
    clap::ValueEnum,
    serde::Serialize,
    serde::Deserialize,
)]
pub enum GVisorPlatform {
    /// systrap backend, the current default and most broadly compatible option.
    #[default]
    Systrap,
    /// KVM-backed sandboxing for the Sentry itself.
    Kvm,
    /// ptrace backend for maximal compatibility where systrap/KVM are unavailable.
    Ptrace,
}
50
51impl GVisorPlatform {
52    pub fn as_flag(self) -> &'static str {
53        match self {
54            Self::Systrap => "systrap",
55            Self::Kvm => "kvm",
56            Self::Ptrace => "ptrace",
57        }
58    }
59}
60
/// Options for running an OCI bundle with gVisor.
///
/// The [`Default`] value is: no networking, every boolean flag `false`, and
/// the default [`GVisorPlatform`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GVisorOciRunOptions {
    /// gVisor networking mode passed to runsc.
    pub network_mode: GVisorNetworkMode,
    /// Skip runsc's cgroup setup when Nucleus manages cgroups externally.
    pub ignore_cgroups: bool,
    /// Use runsc's rootless execution path for pre-created user namespaces.
    pub runsc_rootless: bool,
    /// Fail if the host-side supervisor execute allowlist cannot be installed.
    pub require_supervisor_exec_policy: bool,
    /// gVisor Sentry platform backend.
    pub platform: GVisorPlatform,
}
75
76impl Default for GVisorOciRunOptions {
77    fn default() -> Self {
78        Self {
79            network_mode: GVisorNetworkMode::None,
80            ignore_cgroups: false,
81            runsc_rootless: false,
82            require_supervisor_exec_policy: false,
83            platform: GVisorPlatform::default(),
84        }
85    }
86}
87
88impl GVisorOciRunOptions {
89    fn network_flag(self) -> &'static str {
90        match self.network_mode {
91            GVisorNetworkMode::None => "none",
92            GVisorNetworkMode::Sandbox => "sandbox",
93            GVisorNetworkMode::Host => "host",
94        }
95    }
96}
97
/// GVisor runtime manager
///
/// Implements the gVisor state machine from
/// NucleusSecurity_GVisor_GVisorRuntime.tla
pub struct GVisorRuntime {
    /// Filesystem path of the runsc binary this runtime uses.
    runsc_path: String,
}
105
106impl GVisorRuntime {
107    /// Create a new GVisor runtime manager
108    ///
109    /// This checks for runsc binary availability
110    pub fn new() -> Result<Self> {
111        let runsc_path = Self::find_runsc()?;
112        info!("Found runsc at: {}", runsc_path);
113        Ok(Self { runsc_path })
114    }
115
    /// Create a GVisor runtime with a pre-resolved runsc path.
    ///
    /// Use this when the path was resolved before privilege changes
    /// (e.g. before entering a user namespace where UID 0 would block
    /// PATH-based lookup).
    ///
    /// The path is stored as given; no validation is performed here.
    pub fn with_path(runsc_path: String) -> Self {
        Self { runsc_path }
    }
124
    /// Resolve the runsc path without constructing a full runtime.
    /// Call this before fork/unshare so the path is resolved while
    /// still unprivileged.
    ///
    /// # Errors
    ///
    /// Returns the same error as the underlying runsc lookup.
    pub fn resolve_path() -> Result<String> {
        Self::find_runsc()
    }
131
132    /// Find the runsc binary
133    fn find_runsc() -> Result<String> {
134        // Try common locations
135        let paths = vec![
136            "/usr/local/bin/runsc",
137            "/usr/bin/runsc",
138            "/opt/gvisor/runsc",
139        ];
140
141        for path in &paths {
142            if let Some(validated) = Self::validate_runsc_path(Path::new(path))? {
143                return Ok(validated);
144            }
145        }
146
147        // For privileged execution, do not resolve runtime binaries via PATH.
148        // This avoids environment-based binary hijacking when running as root.
149        if Uid::effective().is_root() {
150            return Err(NucleusError::GVisorError(
151                "runsc binary not found in trusted system paths".to_string(),
152            ));
153        }
154
155        // Try to find in PATH without invoking a shell command.
156        if let Some(path_var) = std::env::var_os("PATH") {
157            for dir in std::env::split_paths(&path_var) {
158                let candidate = dir.join("runsc");
159                if let Some(validated) = Self::validate_runsc_path(&candidate)? {
160                    return Ok(validated);
161                }
162            }
163        }
164
165        Err(NucleusError::GVisorError(
166            "runsc binary not found. Please install gVisor.".to_string(),
167        ))
168    }
169
170    fn validate_runsc_path(path: &Path) -> Result<Option<String>> {
171        if !path.exists() {
172            return Ok(None);
173        }
174        if !path.is_file() {
175            return Ok(None);
176        }
177
178        let canonical = std::fs::canonicalize(path).map_err(|e| {
179            NucleusError::GVisorError(format!(
180                "Failed to canonicalize runsc path {:?}: {}",
181                path, e
182            ))
183        })?;
184
185        // If the candidate is a shell wrapper script (common on NixOS where
186        // nix wraps binaries to inject PATH), look for the real ELF binary
187        // next to it. The gVisor helper re-exec path must stay on the real
188        // binary, not a bash wrapper.
189        let resolved = Self::unwrap_nix_wrapper(&canonical).unwrap_or_else(|| canonical.clone());
190
191        let metadata = std::fs::metadata(&resolved).map_err(|e| {
192            NucleusError::GVisorError(format!("Failed to stat runsc path {:?}: {}", resolved, e))
193        })?;
194
195        let mode = metadata.permissions().mode();
196        if mode & 0o022 != 0 {
197            return Err(NucleusError::GVisorError(format!(
198                "Refusing insecure runsc binary permissions at {:?} (mode {:o})",
199                resolved, mode
200            )));
201        }
202        if mode & 0o111 == 0 {
203            return Ok(None);
204        }
205
206        // Reject binaries owned by other non-root users – a malicious user
207        // could place a trojan runsc earlier in PATH.
208        use std::os::unix::fs::MetadataExt;
209        let owner = metadata.uid();
210        let current_uid = nix::unistd::Uid::effective().as_raw();
211        if !Self::is_trusted_runsc_owner(&resolved, owner, current_uid) {
212            return Err(NucleusError::GVisorError(format!(
213                "Refusing runsc binary at {:?} owned by uid {} (expected root, current user {}, or immutable /nix/store artifact)",
214                resolved, owner, current_uid
215            )));
216        }
217
218        Ok(Some(resolved.to_string_lossy().to_string()))
219    }
220
221    fn is_trusted_runsc_owner(path: &Path, owner: u32, current_uid: u32) -> bool {
222        if owner == 0 || owner == current_uid {
223            return true;
224        }
225
226        // Nix store artifacts are immutable content-addressed paths and are
227        // commonly owned by `nobody` rather than root/current user.
228        // Extra hardening: verify the binary is not writable by *anyone* and
229        // the parent directory is also not writable, to guard against a
230        // compromised or mutable store.
231        if path.starts_with("/nix/store") {
232            if let Ok(meta) = std::fs::metadata(path) {
233                let mode = meta.permissions().mode();
234                // Reject if owner-writable (group/other already checked by caller)
235                if mode & 0o200 != 0 {
236                    return false;
237                }
238            } else {
239                return false;
240            }
241            // Verify the immediate parent directory is not writable
242            if let Some(parent) = path.parent() {
243                if let Ok(parent_meta) = std::fs::metadata(parent) {
244                    let parent_mode = parent_meta.permissions().mode();
245                    if parent_mode & 0o222 != 0 {
246                        return false;
247                    }
248                } else {
249                    return false;
250                }
251            }
252            return true;
253        }
254
255        false
256    }
257
258    /// If `path` is a Nix wrapper script, extract the real binary path.
259    ///
260    /// Nix wrapper scripts end with a line like:
261    ///   exec -a "$0" "/nix/store/…/.runsc-wrapped"  "$@"
262    /// We parse that to find the actual ELF binary.
263    fn unwrap_nix_wrapper(path: &Path) -> Option<std::path::PathBuf> {
264        let content = std::fs::read_to_string(path).ok()?;
265        // Only process short scripts (wrapper scripts are small)
266        if content.len() > 4096 || !content.starts_with("#!") {
267            return None;
268        }
269        // Look for the exec line that references the wrapped binary
270        for line in content.lines().rev() {
271            let trimmed = line.trim();
272            if trimmed.starts_with("exec ") {
273                // Parse: exec -a "$0" "/nix/store/.../bin/.runsc-wrapped"  "$@"
274                // or:    exec "/nix/store/.../bin/.runsc-wrapped"  "$@"
275                for token in trimmed.split_whitespace() {
276                    let unquoted = token.trim_matches('"');
277                    if unquoted.starts_with('/') && unquoted.contains("runsc") {
278                        let candidate = std::path::PathBuf::from(unquoted);
279                        if candidate.exists() && candidate.is_file() {
280                            debug!("Resolved Nix wrapper {:?} → {:?}", path, candidate);
281                            return Some(candidate);
282                        }
283                    }
284                }
285            }
286        }
287        None
288    }
289
290    /// Execute using gVisor with an OCI bundle
291    ///
292    /// This is the OCI-compliant way to run containers with gVisor using
293    /// default options: no networking, systrap platform, no rootless flag,
294    /// and no internal cgroup setup override.
295    pub fn exec_with_oci_bundle(&self, container_id: &str, bundle: &OciBundle) -> Result<()> {
296        self.exec_with_oci_bundle_options(container_id, bundle, GVisorOciRunOptions::default())
297    }
298
    /// Execute using gVisor with an OCI bundle and explicit run options.
    ///
    /// `ignore_cgroups` skips runsc's internal cgroup configuration because
    /// Nucleus already manages cgroups externally and unprivileged callers
    /// cannot configure them directly. `runsc_rootless` selects gVisor's
    /// built-in rootless execution path for cases where Nucleus already
    /// entered a mapped user namespace and therefore cannot express the
    /// namespace setup as an OCI `linux.uidMappings` request.
    /// `require_supervisor_exec_policy` fail-closes if Nucleus cannot install
    /// the host-side execute allowlist before handing control to runsc.
    ///
    /// On success the current process image is replaced by runsc via
    /// `execve`, so this function only returns to the caller on failure.
    ///
    /// # Errors
    ///
    /// Fails when the per-container state directories cannot be secured, the
    /// runsc binary cannot be staged, an argument or environment entry
    /// contains an interior NUL byte, the supervisor exec policy step returns
    /// an error, or `execve` itself fails.
    pub fn exec_with_oci_bundle_options(
        &self,
        container_id: &str,
        bundle: &OciBundle,
        options: GVisorOciRunOptions,
    ) -> Result<()> {
        info!(
            "Executing with gVisor using OCI bundle at {:?} (network: {:?}, platform: {:?})",
            bundle.bundle_path(),
            options.network_mode,
            options.platform,
        );

        // Create a per-container root directory for runsc state. Do not derive
        // this from the OCI bundle parent: --bundle may be operator-provided,
        // shared, or attacker-writable, while runsc state includes a staged
        // executable used by the supervisor process.
        let runsc_root = Self::secure_runsc_root(container_id)?;

        // Private scratch directory; exported below as TMPDIR/XDG_RUNTIME_DIR.
        let runsc_runtime_dir = runsc_root.join("runtime");
        Self::ensure_secure_runsc_dir(&runsc_runtime_dir, "runsc runtime directory")?;

        // Stage a private copy of runsc; `exec_allow_roots` lists the
        // directories the supervisor is later allowed to execute from.
        let (program_path, exec_allow_roots) =
            self.prepare_supervisor_runsc_program(&runsc_root)?;

        // Build runsc command with OCI bundle.
        // Global flags (--root, --network, --platform) must come BEFORE the subcommand.
        // runsc --root <dir> --network <mode> --platform <plat> run --bundle <path> <id>
        let mut args = self.build_oci_run_args(container_id, bundle, &runsc_root, options);
        // argv[0] must name the staged copy that is actually executed.
        args[0] = program_path.to_string_lossy().to_string();

        debug!("runsc OCI args: {:?}", args);

        // Convert to CStrings for exec
        let program = CString::new(program_path.to_string_lossy().as_ref())
            .map_err(|e| NucleusError::GVisorError(format!("Invalid runsc path: {}", e)))?;

        let c_args: Result<Vec<CString>> = args
            .iter()
            .map(|arg| {
                CString::new(arg.as_str())
                    .map_err(|e| NucleusError::GVisorError(format!("Invalid argument: {}", e)))
            })
            .collect();
        let c_args = c_args?;

        let c_env = self.exec_environment(&runsc_runtime_dir)?;

        // For the rootless bridge path, Nucleus has already entered a mapped
        // user namespace. Install an execute-only Landlock allowlist there:
        // runsc may still re-exec itself, but escaped host-side code cannot
        // exec arbitrary host binaries such as NixOS setuid wrappers.
        if options.runsc_rootless {
            self.apply_supervisor_exec_policy(
                &exec_allow_roots,
                options.require_supervisor_exec_policy,
            )?;
        }

        // execve - this replaces the current process with runsc
        nix::unistd::execve::<std::ffi::CString, std::ffi::CString>(&program, &c_args, &c_env)?;

        // Should never reach here
        Ok(())
    }
374
375    /// Execute using gVisor with an OCI bundle and explicit network mode.
376    ///
377    /// Prefer [`Self::exec_with_oci_bundle_options`] for new call sites.
378    #[allow(clippy::too_many_arguments)]
379    pub fn exec_with_oci_bundle_network(
380        &self,
381        container_id: &str,
382        bundle: &OciBundle,
383        network_mode: GVisorNetworkMode,
384        ignore_cgroups: bool,
385        runsc_rootless: bool,
386        require_supervisor_exec_policy: bool,
387        platform: GVisorPlatform,
388    ) -> Result<()> {
389        self.exec_with_oci_bundle_options(
390            container_id,
391            bundle,
392            GVisorOciRunOptions {
393                network_mode,
394                ignore_cgroups,
395                runsc_rootless,
396                require_supervisor_exec_policy,
397                platform,
398            },
399        )
400    }
401
402    /// Check if gVisor is available on this system
403    pub fn is_available() -> bool {
404        Self::find_runsc().is_ok()
405    }
406
407    /// Get runsc version
408    pub fn version(&self) -> Result<String> {
409        let output = Command::new(&self.runsc_path)
410            .arg("--version")
411            .output()
412            .map_err(|e| NucleusError::GVisorError(format!("Failed to get version: {}", e)))?;
413
414        if !output.status.success() {
415            return Err(NucleusError::GVisorError(
416                "Failed to get runsc version".to_string(),
417            ));
418        }
419
420        let version = String::from_utf8_lossy(&output.stdout).to_string();
421        Ok(version.trim().to_string())
422    }
423
424    fn exec_environment(&self, runtime_dir: &Path) -> Result<Vec<CString>> {
425        let mut env = Vec::new();
426        let mut push = |key: &str, value: String| -> Result<()> {
427            env.push(
428                CString::new(format!("{}={}", key, value))
429                    .map_err(|e| NucleusError::GVisorError(format!("Invalid {}: {}", key, e)))?,
430            );
431            Ok(())
432        };
433
434        // Use a hardcoded PATH for the runsc supervisor process to prevent
435        // host PATH from leaking into the gVisor environment.
436        push(
437            "PATH",
438            "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
439        )?;
440        let runtime_dir = runtime_dir.to_string_lossy().to_string();
441        push("TMPDIR", runtime_dir.clone())?;
442        push("XDG_RUNTIME_DIR", runtime_dir)?;
443
444        // Hardcode safe values instead of leaking host identity/paths.
445        // HOME could point to an attacker-controlled directory; USER/LOGNAME
446        // leak host identity information – none of which gVisor needs.
447        push("HOME", "/root".to_string())?;
448        push("USER", "root".to_string())?;
449        push("LOGNAME", "root".to_string())?;
450
451        Ok(env)
452    }
453
454    fn prepare_supervisor_runsc_program(
455        &self,
456        runsc_root: &Path,
457    ) -> Result<(PathBuf, Vec<PathBuf>)> {
458        let canonical = fs::canonicalize(&self.runsc_path).map_err(|e| {
459            NucleusError::GVisorError(format!(
460                "Failed to canonicalize runsc path {:?}: {}",
461                self.runsc_path, e
462            ))
463        })?;
464
465        Self::ensure_secure_runsc_dir(runsc_root, "runsc root directory")?;
466        let private_dir = runsc_root.join("exec-allow");
467        Self::ensure_secure_runsc_dir(&private_dir, "private runsc exec directory")?;
468
469        // Stage every runsc binary, including immutable Nix store artifacts.
470        // gVisor re-execs runsc helpers after Landlock is installed; keeping the
471        // executable under a per-container allowlist avoids granting execute over
472        // the whole store.
473        let stage_dir = Self::create_unique_runsc_stage_dir(&private_dir)?;
474        let staged = stage_dir.join("runsc");
475        Self::copy_runsc_nofollow(&canonical, &staged)?;
476
477        Ok((staged, Self::supervisor_exec_allow_roots(private_dir)))
478    }
479
480    fn supervisor_exec_allow_roots(program_root: PathBuf) -> Vec<PathBuf> {
481        // Do not allow procfs execution here. The packaged runsc is patched to
482        // re-exec helper processes through its real executable path; allowing
483        // /proc would also allow procfs fd magic-link execution attempts that
484        // are outside the supervisor policy's intended executable root.
485        vec![program_root]
486    }
487
488    fn secure_runsc_root(container_id: &str) -> Result<PathBuf> {
489        let artifact_base = Self::gvisor_artifact_base()?;
490        let artifact_dir = artifact_base.join(Self::runsc_state_component(container_id));
491
492        if Self::host_root_requires_trusted_runsc_ancestry() {
493            Self::ensure_trusted_host_root_runsc_ancestry(
494                &artifact_base,
495                "gVisor runsc artifact base",
496            )?;
497        }
498
499        Self::ensure_secure_runsc_dir(&artifact_base, "gVisor runsc artifact base")?;
500        Self::ensure_secure_runsc_dir(&artifact_dir, "gVisor runsc artifact directory")?;
501
502        let runsc_root = artifact_dir.join("runsc-root");
503        Self::ensure_secure_runsc_dir(&runsc_root, "runsc root directory")?;
504        Ok(runsc_root)
505    }
506
507    fn gvisor_artifact_base() -> Result<PathBuf> {
508        if let Some(path) =
509            std::env::var_os("NUCLEUS_GVISOR_ARTIFACT_BASE").filter(|path| !path.is_empty())
510        {
511            return Self::absolute_path(Path::new(&path), "gVisor artifact base");
512        }
513
514        if !Uid::effective().is_root() || Self::root_uid_maps_to_unprivileged_host_uid_from_proc() {
515            if let Some(dir) = dirs::runtime_dir() {
516                return Ok(dir.join("nucleus-gvisor"));
517            }
518        }
519
520        if Uid::effective().is_root() {
521            Ok(PathBuf::from("/run/nucleus-gvisor"))
522        } else {
523            Ok(std::env::temp_dir().join(format!("nucleus-gvisor-{}", Uid::effective().as_raw())))
524        }
525    }
526
527    fn absolute_path(path: &Path, label: &str) -> Result<PathBuf> {
528        if path.is_absolute() {
529            return Ok(path.to_path_buf());
530        }
531
532        std::env::current_dir()
533            .map(|cwd| cwd.join(path))
534            .map_err(|e| {
535                NucleusError::GVisorError(format!(
536                    "Failed to resolve current directory for {} {:?}: {}",
537                    label, path, e
538                ))
539            })
540    }
541
542    fn runsc_state_component(container_id: &str) -> String {
543        if container_id.len() == 32 && container_id.chars().all(|c| c.is_ascii_hexdigit()) {
544            return container_id.to_string();
545        }
546
547        let digest = Sha256::digest(container_id.as_bytes());
548        format!("id-{}", hex::encode(&digest[..16]))
549    }
550
551    fn root_uid_maps_to_unprivileged_host_uid_from_proc() -> bool {
552        fs::read_to_string("/proc/self/uid_map")
553            .map(|uid_map| Self::root_uid_maps_to_unprivileged_host_uid(&uid_map))
554            .unwrap_or(false)
555    }
556
557    fn root_uid_maps_to_unprivileged_host_uid(uid_map: &str) -> bool {
558        for line in uid_map.lines() {
559            let mut fields = line.split_whitespace();
560            let Some(namespace_start) = fields.next() else {
561                continue;
562            };
563            let Some(host_start) = fields.next() else {
564                continue;
565            };
566            let Some(length) = fields.next() else {
567                continue;
568            };
569            if fields.next().is_some() {
570                continue;
571            }
572
573            let Ok(namespace_start) = namespace_start.parse::<u64>() else {
574                continue;
575            };
576            let Ok(host_start) = host_start.parse::<u64>() else {
577                continue;
578            };
579            let Ok(length) = length.parse::<u64>() else {
580                continue;
581            };
582
583            if namespace_start == 0 && length > 0 {
584                return host_start != 0;
585            }
586        }
587
588        false
589    }
590
591    fn host_root_requires_trusted_runsc_ancestry() -> bool {
592        Uid::effective().is_root() && !Self::root_uid_maps_to_unprivileged_host_uid_from_proc()
593    }
594
595    fn ensure_trusted_host_root_runsc_ancestry(path: &Path, label: &str) -> Result<()> {
596        let path = Self::absolute_path(path, label)?;
597
598        let mut current = PathBuf::new();
599        for component in path.components() {
600            match component {
601                Component::Prefix(prefix) => current.push(prefix.as_os_str()),
602                Component::RootDir => current.push(component.as_os_str()),
603                Component::CurDir => {}
604                Component::ParentDir => {
605                    return Err(NucleusError::GVisorError(format!(
606                        "{} {:?} contains a parent-directory component",
607                        label, path
608                    )));
609                }
610                Component::Normal(name) => {
611                    current.push(name);
612                    match fs::symlink_metadata(&current) {
613                        Ok(metadata) => Self::ensure_trusted_host_root_runsc_ancestor_component(
614                            &current, metadata, label,
615                        )?,
616                        Err(e) if e.kind() == io::ErrorKind::NotFound => break,
617                        Err(e) => {
618                            return Err(NucleusError::GVisorError(format!(
619                                "Failed to stat {} ancestor {:?}: {}",
620                                label, current, e
621                            )));
622                        }
623                    }
624                }
625            }
626        }
627
628        Ok(())
629    }
630
631    fn ensure_trusted_host_root_runsc_ancestor_component(
632        path: &Path,
633        metadata: fs::Metadata,
634        label: &str,
635    ) -> Result<()> {
636        if metadata.file_type().is_symlink() {
637            return Err(NucleusError::GVisorError(format!(
638                "Refusing symlink {} ancestor {:?}",
639                label, path
640            )));
641        }
642        if !metadata.file_type().is_dir() {
643            return Err(NucleusError::GVisorError(format!(
644                "{} ancestor {:?} is not a directory",
645                label, path
646            )));
647        }
648
649        let owner = metadata.uid();
650        if owner != 0 {
651            return Err(NucleusError::GVisorError(format!(
652                "{} ancestor {:?} is owned by uid {} (expected root)",
653                label, path, owner
654            )));
655        }
656
657        let mode = metadata.permissions().mode();
658        if mode & 0o022 != 0 && mode & 0o1000 == 0 {
659            return Err(NucleusError::GVisorError(format!(
660                "{} ancestor {:?} has unsafe permissions {:o}",
661                label,
662                path,
663                mode & 0o7777
664            )));
665        }
666
667        Ok(())
668    }
669
    /// Ensure `path` is a private directory owned by the effective UID.
    ///
    /// Steps, in order:
    /// 1. the parent directory (if any) must pass the trusted-parent check;
    /// 2. `path` must not be a symlink or a non-directory; it is created with
    ///    mode `0o700` when missing;
    /// 3. the directory is opened with `O_NOFOLLOW | O_DIRECTORY` and the
    ///    remaining checks run against that open handle;
    /// 4. the owner must equal the effective UID, and the permission bits are
    ///    reset to `0o700` if they differ.
    ///
    /// `label` is only used to build error messages.
    fn ensure_secure_runsc_dir(path: &Path, label: &str) -> Result<()> {
        // A path with no parent, or an empty one, skips the parent check.
        if let Some(parent) = path
            .parent()
            .filter(|parent| !parent.as_os_str().is_empty())
        {
            Self::ensure_trusted_runsc_parent(parent, label)?;
        }

        let mut created = false;
        // lstat, so a symlink at `path` is detected and not followed.
        match fs::symlink_metadata(path) {
            Ok(metadata) if metadata.file_type().is_symlink() => {
                return Err(NucleusError::GVisorError(format!(
                    "Refusing symlink {} {:?}",
                    label, path
                )));
            }
            Ok(metadata) if !metadata.file_type().is_dir() => {
                return Err(NucleusError::GVisorError(format!(
                    "{} {:?} is not a directory",
                    label, path
                )));
            }
            Ok(_) => {}
            Err(e) if e.kind() == io::ErrorKind::NotFound => {
                match DirBuilder::new().mode(0o700).create(path) {
                    Ok(()) => {
                        created = true;
                    }
                    // Something else created it after the stat above; fall
                    // through to the checks below.
                    Err(create_err) if create_err.kind() == io::ErrorKind::AlreadyExists => {}
                    Err(create_err) => {
                        return Err(NucleusError::GVisorError(format!(
                            "Failed to create {} {:?}: {}",
                            label, path, create_err
                        )));
                    }
                }
            }
            Err(e) => {
                return Err(NucleusError::GVisorError(format!(
                    "Failed to stat {} {:?}: {}",
                    label, path, e
                )));
            }
        }

        // Explicitly set the mode on a directory this call created.
        if created {
            fs::set_permissions(path, fs::Permissions::from_mode(0o700)).map_err(|e| {
                NucleusError::GVisorError(format!(
                    "Failed to secure newly-created {} permissions {:?}: {}",
                    label, path, e
                ))
            })?;
        }

        // From here on, work through an open handle so the object that is
        // inspected is the same object that is modified.
        let dir = OpenOptions::new()
            .read(true)
            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC | libc::O_DIRECTORY)
            .open(path)
            .map_err(|e| {
                NucleusError::GVisorError(format!(
                    "Failed to open {} {:?} without following symlinks: {}",
                    label, path, e
                ))
            })?;

        let metadata = dir.metadata().map_err(|e| {
            NucleusError::GVisorError(format!("Failed to stat {} {:?}: {}", label, path, e))
        })?;
        if !metadata.file_type().is_dir() {
            return Err(NucleusError::GVisorError(format!(
                "{} {:?} is not a directory",
                label, path
            )));
        }

        // The directory must belong to the effective UID of this process.
        let owner = metadata.uid();
        let expected = Uid::effective().as_raw();
        if owner != expected {
            return Err(NucleusError::GVisorError(format!(
                "{} {:?} is owned by uid {} (expected {})",
                label, path, owner, expected
            )));
        }

        // Only the nine rwx bits are compared; a differing mode is reset
        // through the open handle instead of being rejected.
        let mode = metadata.permissions().mode() & 0o777;
        if mode != 0o700 {
            dir.set_permissions(fs::Permissions::from_mode(0o700))
                .map_err(|e| {
                    NucleusError::GVisorError(format!(
                        "Failed to secure {} permissions {:?}: {}",
                        label, path, e
                    ))
                })?;
        }

        Ok(())
    }
767
768    fn ensure_trusted_runsc_parent(parent: &Path, label: &str) -> Result<()> {
769        let metadata = fs::symlink_metadata(parent).map_err(|e| {
770            NucleusError::GVisorError(format!(
771                "Failed to stat parent for {} {:?}: {}",
772                label, parent, e
773            ))
774        })?;
775        if metadata.file_type().is_symlink() {
776            return Err(NucleusError::GVisorError(format!(
777                "Refusing symlink parent for {} {:?}",
778                label, parent
779            )));
780        }
781        if !metadata.file_type().is_dir() {
782            return Err(NucleusError::GVisorError(format!(
783                "Parent for {} {:?} is not a directory",
784                label, parent
785            )));
786        }
787
788        let owner = metadata.uid();
789        let current = Uid::effective().as_raw();
790        let owner_trusted = owner == current || owner == 0;
791        let mode = metadata.permissions().mode();
792        let unsafe_writable = mode & 0o022 != 0 && mode & 0o1000 == 0;
793        if !owner_trusted || unsafe_writable {
794            return Err(NucleusError::GVisorError(format!(
795                "Parent for {} {:?} is not trusted (owner uid {}, mode {:o})",
796                label,
797                parent,
798                owner,
799                mode & 0o7777
800            )));
801        }
802
803        Ok(())
804    }
805
806    fn create_unique_runsc_stage_dir(private_dir: &Path) -> Result<PathBuf> {
807        let nonce = std::time::SystemTime::now()
808            .duration_since(std::time::UNIX_EPOCH)
809            .map(|duration| duration.as_nanos())
810            .unwrap_or_default();
811
812        for attempt in 0..100u32 {
813            let stage_dir = private_dir.join(format!(
814                "stage-{}-{}-{}",
815                std::process::id(),
816                nonce,
817                attempt
818            ));
819            match DirBuilder::new().mode(0o700).create(&stage_dir) {
820                Ok(()) => {
821                    Self::ensure_secure_runsc_dir(&stage_dir, "runsc stage directory")?;
822                    return Ok(stage_dir);
823                }
824                Err(e) if e.kind() == io::ErrorKind::AlreadyExists => continue,
825                Err(e) => {
826                    return Err(NucleusError::GVisorError(format!(
827                        "Failed to create runsc stage directory {:?}: {}",
828                        stage_dir, e
829                    )));
830                }
831            }
832        }
833
834        Err(NucleusError::GVisorError(format!(
835            "Failed to create unique runsc stage directory under {:?}",
836            private_dir
837        )))
838    }
839
840    fn copy_runsc_nofollow(source: &Path, staged: &Path) -> Result<()> {
841        let mut source_file = OpenOptions::new()
842            .read(true)
843            .custom_flags(libc::O_CLOEXEC)
844            .open(source)
845            .map_err(|e| {
846                NucleusError::GVisorError(format!(
847                    "Failed to open runsc source {:?}: {}",
848                    source, e
849                ))
850            })?;
851
852        let source_meta = source_file.metadata().map_err(|e| {
853            NucleusError::GVisorError(format!("Failed to stat runsc source {:?}: {}", source, e))
854        })?;
855        if !source_meta.file_type().is_file() {
856            return Err(NucleusError::GVisorError(format!(
857                "runsc source {:?} is not a regular file",
858                source
859            )));
860        }
861
862        let mut staged_file = OpenOptions::new()
863            .write(true)
864            .create_new(true)
865            .mode(0o500)
866            .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
867            .open(staged)
868            .map_err(|e| {
869                NucleusError::GVisorError(format!(
870                    "Failed to create staged runsc binary {:?}: {}",
871                    staged, e
872                ))
873            })?;
874
875        io::copy(&mut source_file, &mut staged_file).map_err(|e| {
876            NucleusError::GVisorError(format!(
877                "Failed to stage runsc binary from {:?} to {:?}: {}",
878                source, staged, e
879            ))
880        })?;
881        staged_file
882            .set_permissions(fs::Permissions::from_mode(0o500))
883            .map_err(|e| {
884                NucleusError::GVisorError(format!(
885                    "Failed to secure staged runsc binary {:?}: {}",
886                    staged, e
887                ))
888            })?;
889        staged_file.sync_all().map_err(|e| {
890            NucleusError::GVisorError(format!(
891                "Failed to sync staged runsc binary {:?}: {}",
892                staged, e
893            ))
894        })?;
895
896        Ok(())
897    }
898
899    fn apply_supervisor_exec_policy(
900        &self,
901        allowed_roots: &[PathBuf],
902        required: bool,
903    ) -> Result<()> {
904        let mut landlock = LandlockManager::new();
905        let applied = landlock.apply_execute_allowlist_policy(allowed_roots, !required)?;
906        if applied {
907            info!(
908                allowed_roots = ?allowed_roots,
909                "Applied gVisor supervisor execute allowlist"
910            );
911        } else if required {
912            return Err(NucleusError::LandlockError(
913                "Required gVisor supervisor execute allowlist was not applied".to_string(),
914            ));
915        } else {
916            warn!(
917                allowed_roots = ?allowed_roots,
918                "gVisor supervisor execute allowlist unavailable"
919            );
920        }
921        Ok(())
922    }
923
924    fn build_oci_run_args(
925        &self,
926        container_id: &str,
927        bundle: &OciBundle,
928        runsc_root: &Path,
929        options: GVisorOciRunOptions,
930    ) -> Vec<String> {
931        let mut args = vec![
932            self.runsc_path.clone(),
933            "--root".to_string(),
934            runsc_root.to_string_lossy().to_string(),
935        ];
936
937        if options.runsc_rootless {
938            args.push("--rootless".to_string());
939        }
940
941        if options.ignore_cgroups {
942            args.push("--ignore-cgroups".to_string());
943        }
944
945        args.extend([
946            "--network".to_string(),
947            options.network_flag().to_string(),
948            "--platform".to_string(),
949            options.platform.as_flag().to_string(),
950            "run".to_string(),
951            "--bundle".to_string(),
952            bundle.bundle_path().to_string_lossy().to_string(),
953            container_id.to_string(),
954        ]);
955
956        args
957    }
958}
959
960#[cfg(test)]
961mod tests {
962    use super::*;
963    use crate::oci::OciConfig;
964    use std::path::{Path, PathBuf};
965    use std::sync::{Mutex, MutexGuard};
966
967    static ENV_LOCK: Mutex<()> = Mutex::new(());
968
969    struct EnvLock {
970        _guard: MutexGuard<'static, ()>,
971    }
972
973    impl EnvLock {
974        fn acquire() -> Self {
975            Self {
976                _guard: ENV_LOCK.lock().unwrap(),
977            }
978        }
979    }
980
981    struct EnvVarGuard {
982        key: &'static str,
983        previous: Option<std::ffi::OsString>,
984    }
985
986    impl EnvVarGuard {
987        fn set(key: &'static str, value: impl AsRef<std::ffi::OsStr>) -> Self {
988            let previous = std::env::var_os(key);
989            std::env::set_var(key, value);
990            Self { key, previous }
991        }
992
993        fn remove(key: &'static str) -> Self {
994            let previous = std::env::var_os(key);
995            std::env::remove_var(key);
996            Self { key, previous }
997        }
998    }
999
1000    impl Drop for EnvVarGuard {
1001        fn drop(&mut self) {
1002            match &self.previous {
1003                Some(value) => std::env::set_var(self.key, value),
1004                None => std::env::remove_var(self.key),
1005            }
1006        }
1007    }
1008
1009    #[test]
1010    fn test_gvisor_availability() {
1011        // This test just checks if we can determine availability
1012        // It may pass or fail depending on whether gVisor is installed
1013        let available = GVisorRuntime::is_available();
1014        println!("gVisor available: {}", available);
1015    }
1016
1017    #[test]
1018    fn test_gvisor_new() {
1019        let runtime = GVisorRuntime::new();
1020        if let Ok(rt) = runtime {
1021            println!("Found runsc at: {}", rt.runsc_path);
1022            if let Ok(version) = rt.version() {
1023                println!("runsc version: {}", version);
1024            }
1025        }
1026    }
1027
1028    #[test]
1029    fn test_find_runsc() {
1030        // Test that find_runsc either succeeds or returns appropriate error
1031        match GVisorRuntime::find_runsc() {
1032            Ok(path) => {
1033                println!("Found runsc at: {}", path);
1034                assert!(!path.is_empty());
1035            }
1036            Err(e) => {
1037                println!("runsc not found (expected if gVisor not installed): {}", e);
1038            }
1039        }
1040    }
1041
1042    #[test]
1043    fn test_validate_runsc_rejects_world_writable() {
1044        let dir = tempfile::tempdir().unwrap();
1045        let fake_runsc = dir.path().join("runsc");
1046        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
1047        // Make world-writable
1048        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o777)).unwrap();
1049
1050        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
1051        assert!(
1052            result.is_err(),
1053            "validate_runsc_path must reject world-writable binaries"
1054        );
1055    }
1056
1057    #[test]
1058    fn test_validate_runsc_rejects_group_writable() {
1059        let dir = tempfile::tempdir().unwrap();
1060        let fake_runsc = dir.path().join("runsc");
1061        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
1062        // Make group-writable
1063        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o775)).unwrap();
1064
1065        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
1066        assert!(
1067            result.is_err(),
1068            "validate_runsc_path must reject group-writable binaries"
1069        );
1070    }
1071
1072    #[test]
1073    fn test_runsc_owner_accepts_nix_store_artifact_owner() {
1074        // Use a real Nix store binary so the metadata/permission checks pass.
1075        // The /nix/store contents are read-only and content-addressed, so any
1076        // existing file with mode 555 works.
1077        let nix_binary = std::fs::read_dir("/nix/store")
1078            .ok()
1079            .and_then(|mut entries| {
1080                entries.find_map(|e| {
1081                    let dir = e.ok()?.path();
1082                    let candidate = dir.join("bin/runsc");
1083                    if candidate.exists() {
1084                        Some(candidate)
1085                    } else {
1086                        None
1087                    }
1088                })
1089            });
1090
1091        let path = match nix_binary {
1092            Some(p) => p,
1093            None => {
1094                eprintln!("skipping: no runsc binary found in /nix/store");
1095                return;
1096            }
1097        };
1098
1099        assert!(GVisorRuntime::is_trusted_runsc_owner(&path, 65534, 1000));
1100    }
1101
1102    #[test]
1103    fn test_exec_environment_uses_hardcoded_path() {
1104        // The gVisor supervisor must NOT inherit the host PATH, to prevent
1105        // host filesystem layout leaking into the container environment.
1106        // Verify by setting a distinctive PATH and checking exec_environment
1107        // returns a hardcoded value instead.
1108        std::env::set_var("PATH", "/tmp/evil-inject/bin:/opt/attacker/sbin");
1109        let rt = GVisorRuntime::with_path("/fake/runsc".to_string());
1110        let tmp = tempfile::tempdir().unwrap();
1111        let env = rt.exec_environment(tmp.path()).unwrap();
1112        let path_entry = env
1113            .iter()
1114            .find(|e| e.to_str().is_ok_and(|s| s.starts_with("PATH=")))
1115            .expect("exec_environment must set PATH");
1116        let path_val = path_entry.to_str().unwrap();
1117        assert!(
1118            !path_val.contains("evil-inject") && !path_val.contains("attacker"),
1119            "exec_environment must use hardcoded PATH, not host PATH. Got: {}",
1120            path_val
1121        );
1122        assert_eq!(
1123            path_val, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1124            "exec_environment PATH must be the standard hardcoded value"
1125        );
1126    }
1127
1128    #[test]
1129    fn test_precreated_rootless_args_pass_runsc_rootless() {
1130        let rt = GVisorRuntime::with_path("/nix/store/fake-runsc/bin/runsc".to_string());
1131        let tmp = tempfile::tempdir().unwrap();
1132        let bundle = OciBundle::new(
1133            tmp.path().join("bundle"),
1134            OciConfig::new(vec!["/bin/true".to_string()], None),
1135        );
1136
1137        let args = rt.build_oci_run_args(
1138            "container-id",
1139            &bundle,
1140            tmp.path(),
1141            GVisorOciRunOptions {
1142                network_mode: GVisorNetworkMode::Host,
1143                ignore_cgroups: true,
1144                runsc_rootless: true,
1145                require_supervisor_exec_policy: false,
1146                platform: GVisorPlatform::Systrap,
1147            },
1148        );
1149
1150        assert!(args.iter().any(|arg| arg == "--rootless"));
1151        assert!(args.iter().any(|arg| arg == "--ignore-cgroups"));
1152    }
1153
1154    #[test]
1155    fn test_rootless_oci_args_do_not_pass_runsc_rootless() {
1156        let rt = GVisorRuntime::with_path("/nix/store/fake-runsc/bin/runsc".to_string());
1157        let tmp = tempfile::tempdir().unwrap();
1158        let bundle = OciBundle::new(
1159            tmp.path().join("bundle"),
1160            OciConfig::new(vec!["/bin/true".to_string()], None),
1161        );
1162
1163        let args = rt.build_oci_run_args(
1164            "container-id",
1165            &bundle,
1166            tmp.path(),
1167            GVisorOciRunOptions {
1168                network_mode: GVisorNetworkMode::Host,
1169                ignore_cgroups: true,
1170                runsc_rootless: false,
1171                require_supervisor_exec_policy: false,
1172                platform: GVisorPlatform::Systrap,
1173            },
1174        );
1175
1176        assert!(!args.iter().any(|arg| arg == "--rootless"));
1177        assert!(args.iter().any(|arg| arg == "--ignore-cgroups"));
1178    }
1179
1180    #[test]
1181    fn test_non_nix_runsc_is_staged_for_supervisor_exec_policy() {
1182        let tmp = tempfile::tempdir().unwrap();
1183        let fake_runsc = tmp.path().join("runsc-source");
1184        std::fs::write(&fake_runsc, b"fake-runsc").unwrap();
1185        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o500)).unwrap();
1186
1187        let rt = GVisorRuntime::with_path(fake_runsc.to_string_lossy().to_string());
1188        let runsc_root = tmp.path().join("runsc-root");
1189        let (program, allow_roots) = rt.prepare_supervisor_runsc_program(&runsc_root).unwrap();
1190
1191        assert!(program.starts_with(runsc_root.join("exec-allow")));
1192        assert_eq!(allow_roots, vec![runsc_root.join("exec-allow")]);
1193        assert_eq!(std::fs::read(&program).unwrap(), b"fake-runsc");
1194        let mode = std::fs::metadata(&program).unwrap().permissions().mode() & 0o777;
1195        assert_eq!(mode, 0o500);
1196    }
1197
1198    #[test]
1199    fn test_supervisor_exec_allow_roots_do_not_include_procfs() {
1200        let roots = GVisorRuntime::supervisor_exec_allow_roots(PathBuf::from(NIX_STORE_EXEC_ROOT));
1201
1202        assert_eq!(roots, vec![PathBuf::from(NIX_STORE_EXEC_ROOT)]);
1203        assert!(
1204            !roots.iter().any(|root| root == Path::new("/proc")),
1205            "the supervisor policy must not allow recursive procfs execution"
1206        );
1207    }
1208
1209    #[test]
1210    fn test_runsc_root_uses_hardened_artifact_dir_not_bundle_parent() {
1211        let _env_lock = EnvLock::acquire();
1212        let tmp = tempfile::tempdir().unwrap();
1213        let artifact_base = tmp.path().join("gvisor-artifacts");
1214        let _artifact_base = EnvVarGuard::set("NUCLEUS_GVISOR_ARTIFACT_BASE", &artifact_base);
1215        let _runtime = EnvVarGuard::remove("XDG_RUNTIME_DIR");
1216
1217        let bundle_parent = tmp.path().join("shared");
1218        std::fs::create_dir_all(&bundle_parent).unwrap();
1219        std::fs::set_permissions(&bundle_parent, std::fs::Permissions::from_mode(0o777)).unwrap();
1220        let bundle = OciBundle::new(
1221            bundle_parent.join("bundle"),
1222            OciConfig::new(vec!["/bin/true".to_string()], None),
1223        );
1224
1225        let runsc_root = GVisorRuntime::secure_runsc_root("container-id").unwrap();
1226
1227        assert!(runsc_root
1228            .starts_with(artifact_base.join(GVisorRuntime::runsc_state_component("container-id"))));
1229        assert!(
1230            !runsc_root.starts_with(bundle.bundle_path().parent().unwrap()),
1231            "runsc root must not be derived from a custom bundle parent"
1232        );
1233    }
1234
1235    #[test]
1236    fn test_runsc_staging_rejects_symlink_exec_allow_dir() {
1237        let tmp = tempfile::tempdir().unwrap();
1238        let fake_runsc = tmp.path().join("runsc-source");
1239        std::fs::write(&fake_runsc, b"fake-runsc").unwrap();
1240        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o500)).unwrap();
1241
1242        let runsc_root = tmp.path().join("runsc-root");
1243        std::fs::create_dir(&runsc_root).unwrap();
1244        std::fs::set_permissions(&runsc_root, std::fs::Permissions::from_mode(0o700)).unwrap();
1245        let victim_dir = tmp.path().join("victim");
1246        std::fs::create_dir(&victim_dir).unwrap();
1247        std::os::unix::fs::symlink(&victim_dir, runsc_root.join("exec-allow")).unwrap();
1248
1249        let rt = GVisorRuntime::with_path(fake_runsc.to_string_lossy().to_string());
1250        let err = rt
1251            .prepare_supervisor_runsc_program(&runsc_root)
1252            .unwrap_err()
1253            .to_string();
1254
1255        assert!(
1256            err.contains("Refusing symlink private runsc exec directory"),
1257            "unexpected error: {}",
1258            err
1259        );
1260        assert!(
1261            !victim_dir.join("runsc").exists(),
1262            "staging must not follow the exec-allow symlink"
1263        );
1264    }
1265
1266    #[test]
1267    fn test_runsc_owner_rejects_untrusted_non_store_owner() {
1268        assert!(!GVisorRuntime::is_trusted_runsc_owner(
1269            Path::new("/tmp/runsc"),
1270            4242,
1271            1000
1272        ));
1273    }
1274}