1#![allow(
2 clippy::similar_names,
3 clippy::match_same_arms,
4 clippy::option_if_let_else,
5 clippy::map_unwrap_or,
6 clippy::manual_unwrap_or_default
7)]
8use std::os::fd::OwnedFd;
34use std::path::{Path, PathBuf};
35use std::sync::Arc;
36
37use oci_spec::runtime::{
38 Arch, Capability, LinuxBuilder, LinuxCapabilitiesBuilder, LinuxIdMappingBuilder,
39 LinuxNamespaceBuilder, LinuxNamespaceType, LinuxSeccompAction, LinuxSeccompBuilder,
40 LinuxSyscallBuilder, Mount, MountBuilder, ProcessBuilder, RootBuilder, Spec, SpecBuilder,
41 UserBuilder,
42};
43use serde::{Deserialize, Serialize};
44use tokio::process::Command;
45use tracing::{debug, warn};
46
47use crate::SandboxError;
48use crate::output::{CapturedOutput, OutputMode, ProcessCapture};
49
/// Namespace kinds a container can be asked to create.
///
/// When the OCI spec is built, every variant except [`CloneFlag::NewUser`] is
/// translated into the matching `LinuxNamespaceType`. `NewUser` is skipped
/// there; the user namespace is controlled by
/// [`ContainerConfig::user_namespace`] instead.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum CloneFlag {
    /// PID namespace (`LinuxNamespaceType::Pid`).
    NewPid,
    /// UTS namespace (`LinuxNamespaceType::Uts`).
    NewUts,
    /// IPC namespace (`LinuxNamespaceType::Ipc`).
    NewIpc,
    /// Mount namespace (`LinuxNamespaceType::Mount`).
    NewNs,
    /// Cgroup namespace (`LinuxNamespaceType::Cgroup`).
    NewCgroup,
    /// Network namespace (`LinuxNamespaceType::Network`).
    NewNet,
    /// User namespace; ignored when namespaces are derived from the flag list.
    NewUser,
}
72
/// A single bind mount to expose inside the container.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BindMount {
    /// Path used as the mount source in the generated OCI spec.
    pub source: String,
    /// Mount destination inside the container.
    pub target: String,
    /// When `true`, the `ro` option is added next to `rbind`.
    pub read_only: bool,
}
83
/// Security-related settings applied to the container process.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContainerSecurity {
    /// Seccomp profile selection.
    pub seccomp: ContainerSeccomp,
    /// Capability names to drop. The literal `"ALL"` discards the built-in
    /// default set entirely.
    pub capabilities_drop: Vec<String>,
    /// Capability names the caller wants granted (parsed the same way as
    /// `capabilities_drop`).
    pub capabilities_add: Vec<String>,
    /// Forwarded to the OCI process `noNewPrivileges` setting.
    pub no_new_privileges: bool,
    /// Explicit UID for the container process. When `None`, the spec builder
    /// uses `0` if a user namespace is requested, otherwise the caller's UID.
    pub run_as_user: Option<u32>,
    /// Explicit GID for the container process; defaults like `run_as_user`.
    pub run_as_group: Option<u32>,
}
100
/// Seccomp profile to attach to the container.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ContainerSeccomp {
    /// No seccomp section is generated.
    Unconfined,
    /// Built-in profile: allow by default, with a fixed list of syscalls
    /// answered with an errno.
    RuntimeDefault,
    /// Profile loaded from a JSON file on the host.
    Localhost {
        /// Location of the JSON profile file.
        path: String,
    },
}
115
/// Everything needed to describe one container launch.
///
/// Usually produced by [`NamespaceContainer::build_config`], but all fields are
/// public so a caller can also assemble it by hand.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContainerConfig {
    /// Namespaces to create (see [`CloneFlag`]).
    pub clone_flags: Vec<CloneFlag>,
    /// `build_config` sets this to `true` when networking is not enabled.
    /// NOTE(review): not read by the spec-building code in this file.
    pub network_isolation: bool,
    /// Requests a user namespace mapping container ID 0 to the caller's
    /// UID/GID. Not emitted when the runtime is gVisor.
    pub user_namespace: bool,
    /// `build_config` always sets this to `true`.
    /// NOTE(review): not read by the spec-building code in this file.
    pub cgroup_namespace: bool,
    /// Host paths to bind into the container. When empty, a fixed list of
    /// host system directories is bound read-only instead.
    pub bind_mounts: Vec<BindMount>,
    /// `build_config` leaves this as `None`.
    /// NOTE(review): not read by the spec-building code in this file.
    pub cgroup_path: Option<String>,
    /// Capabilities, seccomp, and user settings.
    pub security: ContainerSecurity,
    /// Program to execute; becomes the first element of the process args.
    pub command: String,
    /// Arguments following `command`.
    pub args: Vec<String>,
    /// Environment variables, rendered as `KEY=VALUE` entries in the spec.
    pub env: std::collections::HashMap<String, String>,
}
144
/// An interactive container together with the PTY controller received from the
/// runtime over the console socket.
#[derive(Debug)]
pub struct PtySession {
    /// File descriptor received via `SCM_RIGHTS` on the console socket.
    pub controller: OwnedFd,
    /// Handle to the spawned runtime process.
    pub child: tokio::process::Child,
    /// Bundle directory handle; stored here so it is dropped together with
    /// the session rather than earlier.
    _bundle: tempfile::TempDir,
}
164
/// A non-interactive container started by [`NamespaceContainer::spawn`].
#[derive(Debug)]
pub struct ContainerProcess {
    /// Handle to the spawned runtime process (stdout/stderr are piped).
    pub child: tokio::process::Child,
    /// Bundle directory handle; stored here so it is dropped together with
    /// this value rather than earlier.
    _bundle: tempfile::TempDir,
}
178
/// Which OCI runtime binary is used to run containers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum OciRuntime {
    /// The `runc` binary.
    Runc,
    /// gVisor's `runsc` binary.
    Gvisor,
}
197
/// gVisor platform passed to `runsc` through its `--platform=` flag.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum GvisorPlatform {
    /// `--platform=systrap`; probed first.
    Systrap,
    /// `--platform=ptrace`; used when the systrap probe fails.
    Ptrace,
}
210
/// Process-wide cache of the platform chosen by `resolve_gvisor_platform`.
///
/// Holds one of the `PLATFORM_*` codes below and starts at `0`.
static GVISOR_PLATFORM_CACHE: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);

/// Cache code matching the initial value: nothing has been stored yet.
const _PLATFORM_NOT_PROBED: u8 = 0;
/// Cache code recording that systrap was selected.
const PLATFORM_SYSTRAP: u8 = 1;
/// Cache code recording that ptrace was selected.
const PLATFORM_PTRACE: u8 = 2;
220
/// Launches processes inside OCI containers via an external runtime binary.
#[derive(Debug)]
pub struct NamespaceContainer {
    /// Resolved location of the runtime executable.
    runtime_path: PathBuf,
    /// Which runtime `runtime_path` refers to.
    runtime_kind: OciRuntime,
    /// Platform flag to pass to `runsc`; only consulted when `runtime_kind`
    /// is [`OciRuntime::Gvisor`].
    gvisor_platform: GvisorPlatform,
}
242
243impl NamespaceContainer {
    /// Creates a launcher backed by `runc`.
    ///
    /// # Errors
    ///
    /// Propagates the error from [`Self::with_runtime`], i.e. the runtime
    /// binary could not be located.
    pub fn new() -> Result<Self, SandboxError> {
        Self::with_runtime(OciRuntime::Runc)
    }
252
    /// Creates a launcher backed by gVisor (`runsc`).
    ///
    /// # Errors
    ///
    /// Propagates the error from [`Self::with_runtime`], i.e. the runtime
    /// binary could not be located.
    pub fn with_gvisor() -> Result<Self, SandboxError> {
        Self::with_runtime(OciRuntime::Gvisor)
    }
267
268 pub fn with_runtime(kind: OciRuntime) -> Result<Self, SandboxError> {
275 let name = match kind {
276 OciRuntime::Runc => "runc",
277 OciRuntime::Gvisor => "runsc",
278 };
279 let path =
280 which_binary(name).map_err(|()| SandboxError::RuntimeNotFound { name: name.into() })?;
281 debug!(runtime = name, path = %path.display(), "found OCI runtime");
282
283 let gvisor_platform = if kind == OciRuntime::Gvisor {
284 resolve_gvisor_platform(&path)
285 } else {
286 GvisorPlatform::Systrap };
288
289 Ok(Self {
290 runtime_path: path,
291 runtime_kind: kind,
292 gvisor_platform,
293 })
294 }
295
296 pub fn spawn(&self, config: &ContainerConfig) -> Result<ContainerProcess, SandboxError> {
306 let (bundle, container_id) = self.prepare_bundle(config, false)?;
307
308 debug!(runtime = %self.runtime_path.display(), %container_id, "spawning namespace container");
309
310 let child = self
311 .build_run_command(&bundle, &container_id, None)
312 .stdin(std::process::Stdio::null())
313 .stdout(std::process::Stdio::piped())
314 .stderr(std::process::Stdio::piped())
315 .kill_on_drop(true)
316 .spawn()
317 .map_err(|e| SandboxError::RuntimeFailed {
318 reason: format!("spawn failed: {e}"),
319 })?;
320
321 Ok(ContainerProcess {
322 child,
323 _bundle: bundle,
324 })
325 }
326
327 pub fn spawn_captured(
339 &self,
340 config: &ContainerConfig,
341 mode: OutputMode,
342 ) -> Result<ProcessCapture, SandboxError> {
343 let output = CapturedOutput::new(mode).map_err(|e| SandboxError::RuntimeFailed {
344 reason: format!("create capture directory: {e}"),
345 })?;
346
347 let stdout_file = std::fs::OpenOptions::new()
348 .create(true)
349 .truncate(true)
350 .write(true)
351 .open(output.stdout_path())
352 .map_err(|e| SandboxError::RuntimeFailed {
353 reason: format!("open stdout capture file: {e}"),
354 })?;
355
356 let stderr_file = match mode {
357 OutputMode::Combined => {
358 stdout_file
359 .try_clone()
360 .map_err(|e| SandboxError::RuntimeFailed {
361 reason: format!("clone stdout handle for combined stderr: {e}"),
362 })?
363 }
364 OutputMode::Separate => {
365 let stderr_path =
366 output
367 .stderr_path()
368 .ok_or_else(|| SandboxError::RuntimeFailed {
369 reason: "separate mode missing stderr path".into(),
370 })?;
371 std::fs::OpenOptions::new()
372 .create(true)
373 .truncate(true)
374 .write(true)
375 .open(stderr_path)
376 .map_err(|e| SandboxError::RuntimeFailed {
377 reason: format!("open stderr capture file: {e}"),
378 })?
379 }
380 };
381
382 let (bundle, container_id) = self.prepare_bundle(config, false)?;
383
384 debug!(runtime = %self.runtime_path.display(), %container_id, "spawning captured namespace container");
385
386 let child = self
387 .build_run_command(&bundle, &container_id, None)
388 .stdin(std::process::Stdio::null())
389 .stdout(std::process::Stdio::from(stdout_file))
390 .stderr(std::process::Stdio::from(stderr_file))
391 .kill_on_drop(true)
392 .spawn()
393 .map_err(|e| SandboxError::RuntimeFailed {
394 reason: format!("spawn failed: {e}"),
395 })?;
396
397 Ok(ProcessCapture {
398 output: Arc::new(output),
399 child,
400 _bundle: Some(bundle),
401 })
402 }
403
404 pub fn spawn_interactive(&self, config: &ContainerConfig) -> Result<PtySession, SandboxError> {
417 let (bundle, container_id) = self.prepare_bundle(config, true)?;
418 let socket_path = bundle.path().join("console.sock");
419
420 let listener = std::os::unix::net::UnixListener::bind(&socket_path).map_err(|e| {
421 SandboxError::RuntimeFailed {
422 reason: format!("bind console socket: {e}"),
423 }
424 })?;
425
426 debug!(runtime = %self.runtime_path.display(), %container_id, "spawning interactive namespace container");
427
428 let child = self
429 .build_run_command(&bundle, &container_id, Some(&socket_path))
430 .stdin(std::process::Stdio::null())
431 .stdout(std::process::Stdio::null())
432 .stderr(std::process::Stdio::piped())
433 .kill_on_drop(true)
434 .spawn()
435 .map_err(|e| SandboxError::RuntimeFailed {
436 reason: format!("spawn failed: {e}"),
437 })?;
438
439 let (stream, _) = listener.accept().map_err(|e| SandboxError::RuntimeFailed {
442 reason: format!("accept console socket: {e}"),
443 })?;
444
445 let controller = recv_pty_controller(&stream)?;
446
447 Ok(PtySession {
448 controller,
449 child,
450 _bundle: bundle,
451 })
452 }
453
454 #[must_use]
459 pub fn build_config(
460 sandbox: &synwire_core::agents::sandbox::SandboxConfig,
461 command: impl Into<String>,
462 args: Vec<String>,
463 ) -> ContainerConfig {
464 use synwire_core::agents::sandbox::SeccompProfile;
465
466 let network_enabled = sandbox.network.as_ref().is_some_and(|n| n.enabled);
467
468 let mut clone_flags = vec![
469 CloneFlag::NewUts,
470 CloneFlag::NewIpc,
471 CloneFlag::NewNs,
472 CloneFlag::NewCgroup,
473 CloneFlag::NewPid,
474 ];
475 if !network_enabled {
476 clone_flags.push(CloneFlag::NewNet);
477 }
478
479 let bind_mounts = sandbox
481 .filesystem
482 .as_ref()
483 .map(|fs| {
484 let mut mounts: Vec<BindMount> = fs
485 .allow_write
486 .iter()
487 .filter_map(|p| {
488 let abs = to_absolute(p)?;
489 Some(BindMount {
490 source: abs.clone(),
491 target: abs,
492 read_only: false,
493 })
494 })
495 .collect();
496 if fs.inherit_readable {
497 mounts.push(BindMount {
498 source: "/".into(),
499 target: "/".into(),
500 read_only: true,
501 });
502 }
503 mounts
504 })
505 .unwrap_or_default();
506
507 let mut env: std::collections::HashMap<String, String> = if sandbox.env.inherit_parent {
509 std::env::vars().collect()
510 } else {
511 std::collections::HashMap::new()
512 };
513 for k in &sandbox.env.unset {
514 let _ = env.remove(k);
515 }
516 env.extend(sandbox.env.set.clone());
517
518 let seccomp = match &sandbox.security.seccomp {
520 SeccompProfile::Unconfined => ContainerSeccomp::Unconfined,
521 SeccompProfile::Localhost { path } => {
522 ContainerSeccomp::Localhost { path: path.clone() }
523 }
524 SeccompProfile::RuntimeDefault | _ => ContainerSeccomp::RuntimeDefault,
525 };
526
527 let security = ContainerSecurity {
528 seccomp,
529 capabilities_drop: sandbox.security.capabilities.drop.clone(),
530 capabilities_add: sandbox.security.capabilities.add.clone(),
531 no_new_privileges: sandbox.security.no_new_privileges,
532 run_as_user: sandbox.security.run_as_user,
533 run_as_group: sandbox.security.run_as_group,
534 };
535
536 ContainerConfig {
537 clone_flags,
538 network_isolation: !network_enabled,
539 user_namespace: true,
540 cgroup_namespace: true,
541 bind_mounts,
542 cgroup_path: None,
543 security,
544 command: command.into(),
545 args,
546 env,
547 }
548 }
549
550 fn build_run_command(
557 &self,
558 bundle: &tempfile::TempDir,
559 container_id: &str,
560 console_socket: Option<&Path>,
561 ) -> Command {
562 let mut cmd = Command::new(&self.runtime_path);
563
564 if self.runtime_kind == OciRuntime::Gvisor {
566 let platform_flag = match self.gvisor_platform {
567 GvisorPlatform::Systrap => "--platform=systrap",
568 GvisorPlatform::Ptrace => "--platform=ptrace",
569 };
570 let _cmd = cmd
571 .arg("--rootless")
572 .arg("--network=host")
573 .arg(platform_flag);
574 }
575
576 let _cmd = cmd.arg("run");
577
578 if let Some(sock) = console_socket {
579 let _cmd = cmd.arg("--console-socket").arg(sock);
580 }
581
582 let _cmd = cmd.arg("--bundle").arg(bundle.path()).arg(container_id);
583
584 cmd
585 }
586
587 fn prepare_bundle(
589 &self,
590 config: &ContainerConfig,
591 terminal: bool,
592 ) -> Result<(tempfile::TempDir, String), SandboxError> {
593 let bundle = tempfile::TempDir::with_prefix("synwire-").map_err(|e| {
594 SandboxError::RuntimeFailed {
595 reason: format!("create bundle dir: {e}"),
596 }
597 })?;
598 let rootfs = bundle.path().join("rootfs");
599 let container_id = uuid::Uuid::new_v4().to_string();
600
601 let passwd_path = bundle.path().join("passwd");
604 let group_path = bundle.path().join("group");
605 generate_user_files(&passwd_path, &group_path).map_err(|e| {
606 SandboxError::RuntimeFailed {
607 reason: format!("generate user files: {e}"),
608 }
609 })?;
610
611 let spec = build_oci_spec(
612 config,
613 terminal,
614 &passwd_path,
615 &group_path,
616 self.runtime_kind,
617 )
618 .map_err(|e| SandboxError::RuntimeFailed {
619 reason: format!("build OCI spec: {e}"),
620 })?;
621
622 prepare_rootfs(&rootfs, &spec).map_err(|e| SandboxError::RuntimeFailed {
624 reason: format!("prepare rootfs: {e}"),
625 })?;
626
627 let spec_json = serde_json::to_string_pretty(&spec).map_err(SandboxError::SerdeError)?;
628 std::fs::write(bundle.path().join("config.json"), spec_json).map_err(|e| {
629 SandboxError::RuntimeFailed {
630 reason: format!("write config.json: {e}"),
631 }
632 })?;
633
634 Ok((bundle, container_id))
635 }
636}
637
638fn parse_capability(name: &str) -> Option<Capability> {
643 let canon = format!("CAP_{}", name.trim_start_matches("CAP_"));
644 serde_json::from_value(serde_json::Value::String(canon)).ok()
646}
647
648#[allow(clippy::too_many_lines)]
650fn build_oci_spec(
651 config: &ContainerConfig,
652 terminal: bool,
653 passwd_path: &Path,
654 group_path: &Path,
655 runtime: OciRuntime,
656) -> Result<Spec, oci_spec::OciSpecError> {
657 let uid = nix::unistd::getuid().as_raw();
658 let gid = nix::unistd::getgid().as_raw();
659
660 let mut args = vec![config.command.clone()];
662 args.extend(config.args.clone());
663
664 let env: Vec<String> = config.env.iter().map(|(k, v)| format!("{k}={v}")).collect();
666
667 let mut namespaces = Vec::new();
669 for flag in &config.clone_flags {
670 let ns_type = match flag {
671 CloneFlag::NewPid => LinuxNamespaceType::Pid,
672 CloneFlag::NewUts => LinuxNamespaceType::Uts,
673 CloneFlag::NewIpc => LinuxNamespaceType::Ipc,
674 CloneFlag::NewNs => LinuxNamespaceType::Mount,
675 CloneFlag::NewCgroup => LinuxNamespaceType::Cgroup,
676 CloneFlag::NewNet => LinuxNamespaceType::Network,
677 CloneFlag::NewUser => continue, };
679 namespaces.push(LinuxNamespaceBuilder::default().typ(ns_type).build()?);
680 }
681 if config.user_namespace && runtime != OciRuntime::Gvisor {
684 namespaces.push(
685 LinuxNamespaceBuilder::default()
686 .typ(LinuxNamespaceType::User)
687 .build()?,
688 );
689 }
690
691 let mut mounts = essential_mounts()?;
693 for bm in &config.bind_mounts {
694 let mut opts = vec!["rbind".to_string()];
695 if bm.read_only {
696 opts.push("ro".to_string());
697 }
698 mounts.push(
699 MountBuilder::default()
700 .destination(&bm.target)
701 .typ("bind")
702 .source(&bm.source)
703 .options(opts)
704 .build()?,
705 );
706 }
707 if config.bind_mounts.is_empty() {
709 for dir in &[
710 "/usr", "/bin", "/sbin", "/lib", "/lib64", "/etc", "/home", "/tmp",
711 ] {
712 if Path::new(dir).exists() {
713 mounts.push(
714 MountBuilder::default()
715 .destination(*dir)
716 .typ("bind")
717 .source(*dir)
718 .options(vec!["rbind".into(), "ro".into()])
719 .build()?,
720 );
721 }
722 }
723 }
724
725 mounts.push(
729 MountBuilder::default()
730 .destination("/etc/passwd")
731 .typ("bind")
732 .source(passwd_path)
733 .options(vec!["bind".into(), "ro".into()])
734 .build()?,
735 );
736 mounts.push(
737 MountBuilder::default()
738 .destination("/etc/group")
739 .typ("bind")
740 .source(group_path)
741 .options(vec!["bind".into(), "ro".into()])
742 .build()?,
743 );
744
745 let caps = build_capabilities(&config.security)?;
747
748 let seccomp = build_seccomp(&config.security.seccomp)?;
750
751 let masked_paths = vec![
752 "/proc/acpi".into(),
753 "/proc/asound".into(),
754 "/proc/kcore".into(),
755 "/proc/keys".into(),
756 "/proc/latency_stats".into(),
757 "/proc/timer_list".into(),
758 "/proc/timer_stats".into(),
759 "/proc/sched_debug".into(),
760 "/proc/scsi".into(),
761 "/sys/firmware".into(),
762 "/sys/devices/virtual/powercap".into(),
763 ];
764 let readonly_paths = vec![
765 "/proc/bus".into(),
766 "/proc/fs".into(),
767 "/proc/irq".into(),
768 "/proc/sys".into(),
769 "/proc/sysrq-trigger".into(),
770 ];
771
772 let mut linux_builder = LinuxBuilder::default();
773 linux_builder = linux_builder
774 .namespaces(namespaces)
775 .masked_paths(masked_paths)
776 .readonly_paths(readonly_paths);
777
778 if config.user_namespace && runtime != OciRuntime::Gvisor {
781 linux_builder = linux_builder
791 .uid_mappings(vec![
792 LinuxIdMappingBuilder::default()
793 .container_id(0u32)
794 .host_id(uid)
795 .size(1u32)
796 .build()?,
797 ])
798 .gid_mappings(vec![
799 LinuxIdMappingBuilder::default()
800 .container_id(0u32)
801 .host_id(gid)
802 .size(1u32)
803 .build()?,
804 ]);
805 }
806
807 if runtime != OciRuntime::Gvisor
811 && let Some(sec) = seccomp
812 {
813 linux_builder = linux_builder.seccomp(sec);
814 }
815
816 let linux = linux_builder.build()?;
817
818 #[allow(clippy::similar_names)]
821 let container_uid = if config.user_namespace { 0 } else { uid };
822 #[allow(clippy::similar_names)]
823 let container_gid = if config.user_namespace { 0 } else { gid };
824
825 let user = UserBuilder::default()
826 .uid(config.security.run_as_user.unwrap_or(container_uid))
827 .gid(config.security.run_as_group.unwrap_or(container_gid))
828 .build()?;
829
830 let process = ProcessBuilder::default()
831 .terminal(terminal)
832 .user(user)
833 .args(args)
834 .env(env)
835 .cwd("/")
836 .capabilities(caps)
837 .no_new_privileges(config.security.no_new_privileges)
838 .build()?;
839
840 let root = RootBuilder::default()
841 .path("rootfs")
842 .readonly(true)
843 .build()?;
844
845 SpecBuilder::default()
846 .version("1.0.2")
847 .process(process)
848 .root(root)
849 .hostname("synwire")
850 .mounts(mounts)
851 .linux(linux)
852 .build()
853}
854
855fn essential_mounts() -> Result<Vec<Mount>, oci_spec::OciSpecError> {
857 Ok(vec![
858 MountBuilder::default()
859 .destination("/proc")
860 .typ("proc")
861 .source("proc")
862 .options(vec!["nosuid".into(), "noexec".into(), "nodev".into()])
863 .build()?,
864 MountBuilder::default()
865 .destination("/dev")
866 .typ("tmpfs")
867 .source("tmpfs")
868 .options(vec![
869 "nosuid".into(),
870 "strictatime".into(),
871 "mode=755".into(),
872 "size=65536k".into(),
873 ])
874 .build()?,
875 MountBuilder::default()
876 .destination("/dev/pts")
877 .typ("devpts")
878 .source("devpts")
879 .options(vec![
880 "nosuid".into(),
881 "noexec".into(),
882 "newinstance".into(),
883 "ptmxmode=0666".into(),
884 "mode=0620".into(),
885 ])
886 .build()?,
887 MountBuilder::default()
888 .destination("/dev/shm")
889 .typ("tmpfs")
890 .source("shm")
891 .options(vec![
892 "nosuid".into(),
893 "noexec".into(),
894 "nodev".into(),
895 "mode=1777".into(),
896 "size=65536k".into(),
897 ])
898 .build()?,
899 MountBuilder::default()
900 .destination("/dev/mqueue")
901 .typ("mqueue")
902 .source("mqueue")
903 .options(vec!["nosuid".into(), "noexec".into(), "nodev".into()])
904 .build()?,
905 MountBuilder::default()
906 .destination("/sys")
907 .typ("none")
908 .source("/sys")
909 .options(vec![
910 "rbind".into(),
911 "nosuid".into(),
912 "noexec".into(),
913 "nodev".into(),
914 "ro".into(),
915 ])
916 .build()?,
917 ])
918}
919
920fn build_capabilities(
922 security: &ContainerSecurity,
923) -> Result<oci_spec::runtime::LinuxCapabilities, oci_spec::OciSpecError> {
924 let drop_all = security.capabilities_drop.iter().any(|c| c == "ALL");
925 let caps: oci_spec::runtime::Capabilities = if drop_all {
926 security
927 .capabilities_add
928 .iter()
929 .filter_map(|c| parse_capability(c))
930 .collect()
931 } else {
932 let mut caps: oci_spec::runtime::Capabilities = [
941 Capability::Kill,
942 Capability::NetBindService,
943 Capability::Setpcap,
944 ]
945 .into_iter()
946 .collect();
947
948 for drop in &security.capabilities_drop {
949 if let Some(cap) = parse_capability(drop) {
950 let _ = caps.remove(&cap);
951 }
952 }
953 caps
954 };
955
956 LinuxCapabilitiesBuilder::default()
957 .bounding(caps.clone())
958 .effective(caps.clone())
959 .inheritable(caps.clone())
960 .permitted(caps.clone())
961 .ambient(caps)
962 .build()
963}
964
965fn build_seccomp(
967 seccomp: &ContainerSeccomp,
968) -> Result<Option<oci_spec::runtime::LinuxSeccomp>, oci_spec::OciSpecError> {
969 match seccomp {
970 ContainerSeccomp::Unconfined => Ok(None),
971 ContainerSeccomp::RuntimeDefault => {
972 let syscall = LinuxSyscallBuilder::default()
973 .names(vec![
974 "kexec_file_load".into(),
975 "kexec_load".into(),
976 "open_by_handle_at".into(),
977 "perf_event_open".into(),
978 "process_vm_readv".into(),
979 "process_vm_writev".into(),
980 "ptrace".into(),
981 "reboot".into(),
982 "request_key".into(),
983 "set_mempolicy".into(),
984 "swapon".into(),
985 "swapoff".into(),
986 "syslog".into(),
987 "umount2".into(),
988 "unshare".into(),
989 "uselib".into(),
990 "userfaultfd".into(),
991 ])
992 .action(LinuxSeccompAction::ScmpActErrno)
993 .errno_ret(1u32)
994 .build()?;
995
996 Ok(Some(
997 LinuxSeccompBuilder::default()
998 .default_action(LinuxSeccompAction::ScmpActAllow)
999 .architectures(vec![
1000 Arch::ScmpArchX86_64,
1001 Arch::ScmpArchX86,
1002 Arch::ScmpArchAarch64,
1003 ])
1004 .syscalls(vec![syscall])
1005 .build()?,
1006 ))
1007 }
1008 ContainerSeccomp::Localhost { path } => {
1009 Ok(std::fs::read_to_string(path)
1011 .ok()
1012 .and_then(|s| serde_json::from_str(&s).ok()))
1013 }
1014 }
1015}
1016
1017fn prepare_rootfs(rootfs: &Path, spec: &Spec) -> std::io::Result<()> {
1019 std::fs::create_dir_all(rootfs)?;
1020 if let Some(mounts) = spec.mounts() {
1021 for mount in mounts {
1022 let dest = mount.destination();
1023 let target = rootfs.join(dest.strip_prefix("/").unwrap_or(dest));
1024 std::fs::create_dir_all(&target)?;
1025 }
1026 }
1027 Ok(())
1028}
1029
1030fn recv_pty_controller(stream: &std::os::unix::net::UnixStream) -> Result<OwnedFd, SandboxError> {
1035 use nix::sys::socket::{ControlMessageOwned, MsgFlags, recvmsg};
1036 use std::os::fd::{AsRawFd, FromRawFd};
1037
1038 let mut buf = [0u8; 1];
1039 let mut iov = [std::io::IoSliceMut::new(&mut buf)];
1040 let mut cmsg_buf = nix::cmsg_space!(std::os::fd::RawFd);
1041
1042 let msg = recvmsg::<()>(
1043 stream.as_raw_fd(),
1044 &mut iov,
1045 Some(&mut cmsg_buf),
1046 MsgFlags::empty(),
1047 )
1048 .map_err(|e| SandboxError::RuntimeFailed {
1049 reason: format!("recvmsg on console socket: {e}"),
1050 })?;
1051
1052 let iter = msg.cmsgs().map_err(|e| SandboxError::RuntimeFailed {
1053 reason: format!("parse control messages: {e}"),
1054 })?;
1055 for cmsg in iter {
1056 if let ControlMessageOwned::ScmRights(fds) = cmsg
1057 && let Some(&raw_fd) = fds.first()
1058 {
1059 #[allow(unsafe_code)]
1064 let owned = unsafe { std::os::fd::OwnedFd::from_raw_fd(raw_fd) };
1065 return Ok(owned);
1066 }
1067 }
1068
1069 Err(SandboxError::RuntimeFailed {
1070 reason: "no PTY controller fd received from runtime".into(),
1071 })
1072}
1073
1074#[allow(clippy::doc_lazy_continuation)]
1082fn resolve_gvisor_platform(runsc_path: &Path) -> GvisorPlatform {
1083 use std::sync::atomic::Ordering;
1084
1085 let cached = GVISOR_PLATFORM_CACHE.load(Ordering::Relaxed);
1086 if cached == PLATFORM_SYSTRAP {
1087 return GvisorPlatform::Systrap;
1088 }
1089 if cached == PLATFORM_PTRACE {
1090 return GvisorPlatform::Ptrace;
1091 }
1092
1093 debug!("probing gVisor systrap platform");
1095 if probe_gvisor_platform(runsc_path, "systrap") {
1096 debug!("gVisor systrap platform works — using for all future containers");
1097 GVISOR_PLATFORM_CACHE.store(PLATFORM_SYSTRAP, Ordering::Relaxed);
1098 return GvisorPlatform::Systrap;
1099 }
1100
1101 if probe_gvisor_platform(runsc_path, "ptrace") {
1103 warn!(
1104 "gVisor systrap platform failed (likely missing CAP_SYS_PTRACE in \
1105 rootless+host-network mode — see runsc/sandbox/sandbox.go \
1106 ConfigureCmdForRootless). Falling back to ptrace platform for all \
1107 future gVisor containers in this process."
1108 );
1109 GVISOR_PLATFORM_CACHE.store(PLATFORM_PTRACE, Ordering::Relaxed);
1110 return GvisorPlatform::Ptrace;
1111 }
1112
1113 warn!("gVisor probe failed for both systrap and ptrace — defaulting to ptrace");
1115 GVISOR_PLATFORM_CACHE.store(PLATFORM_PTRACE, Ordering::Relaxed);
1116 GvisorPlatform::Ptrace
1117}
1118
1119fn probe_gvisor_platform(runsc_path: &Path, platform: &str) -> bool {
1121 let Ok(bundle_dir) = tempfile::TempDir::with_prefix("synwire-") else {
1122 return false;
1123 };
1124 let rootfs = bundle_dir.path().join("rootfs");
1125 if std::fs::create_dir_all(&rootfs).is_err() {
1126 return false;
1127 }
1128
1129 let Ok(spec) = build_gvisor_probe_spec() else {
1130 return false;
1131 };
1132
1133 if let Err(_e) = prepare_rootfs(&rootfs, &spec) {
1135 return false;
1136 }
1137
1138 let Ok(spec_json) = serde_json::to_string_pretty(&spec) else {
1139 return false;
1140 };
1141 if std::fs::write(bundle_dir.path().join("config.json"), spec_json).is_err() {
1142 return false;
1143 }
1144
1145 let container_id = format!("probe-{}", uuid::Uuid::new_v4());
1146 let result = std::process::Command::new(runsc_path)
1147 .arg("--rootless")
1148 .arg("--network=host")
1149 .arg(format!("--platform={platform}"))
1150 .arg("run")
1151 .arg("--bundle")
1152 .arg(bundle_dir.path())
1153 .arg(&container_id)
1154 .stdin(std::process::Stdio::null())
1155 .stdout(std::process::Stdio::null())
1156 .stderr(std::process::Stdio::null())
1157 .status();
1158
1159 match result {
1160 Ok(status) => status.success(),
1161 Err(_) => false,
1162 }
1163}
1164
1165fn build_gvisor_probe_spec() -> Result<Spec, oci_spec::OciSpecError> {
1167 let uid = nix::unistd::getuid().as_raw();
1168 let gid = nix::unistd::getgid().as_raw();
1169
1170 let empty_caps: oci_spec::runtime::Capabilities = std::collections::HashSet::default();
1171 let caps = LinuxCapabilitiesBuilder::default()
1172 .bounding(empty_caps.clone())
1173 .effective(empty_caps.clone())
1174 .inheritable(empty_caps.clone())
1175 .permitted(empty_caps.clone())
1176 .ambient(empty_caps)
1177 .build()?;
1178
1179 let process = ProcessBuilder::default()
1180 .terminal(false)
1181 .user(UserBuilder::default().uid(0u32).gid(0u32).build()?)
1182 .args(vec!["/bin/true".into()])
1183 .env(vec!["PATH=/usr/bin:/bin".into()])
1184 .cwd("/")
1185 .capabilities(caps)
1186 .no_new_privileges(true)
1187 .build()?;
1188
1189 let root = RootBuilder::default()
1190 .path("rootfs")
1191 .readonly(true)
1192 .build()?;
1193
1194 let namespaces = vec![
1195 LinuxNamespaceBuilder::default()
1196 .typ(LinuxNamespaceType::Pid)
1197 .build()?,
1198 LinuxNamespaceBuilder::default()
1199 .typ(LinuxNamespaceType::Mount)
1200 .build()?,
1201 LinuxNamespaceBuilder::default()
1202 .typ(LinuxNamespaceType::Ipc)
1203 .build()?,
1204 LinuxNamespaceBuilder::default()
1205 .typ(LinuxNamespaceType::Uts)
1206 .build()?,
1207 LinuxNamespaceBuilder::default()
1208 .typ(LinuxNamespaceType::Cgroup)
1209 .build()?,
1210 ];
1211
1212 let linux = LinuxBuilder::default()
1213 .namespaces(namespaces)
1214 .uid_mappings(vec![
1215 LinuxIdMappingBuilder::default()
1216 .container_id(0u32)
1217 .host_id(uid)
1218 .size(1u32)
1219 .build()?,
1220 ])
1221 .gid_mappings(vec![
1222 LinuxIdMappingBuilder::default()
1223 .container_id(0u32)
1224 .host_id(gid)
1225 .size(1u32)
1226 .build()?,
1227 ])
1228 .build()?;
1229
1230 SpecBuilder::default()
1231 .version("1.0.2")
1232 .process(process)
1233 .root(root)
1234 .mounts(probe_mounts()?)
1235 .linux(linux)
1236 .build()
1237}
1238
1239fn probe_mounts() -> Result<Vec<Mount>, oci_spec::OciSpecError> {
1241 let mut mounts = vec![
1242 MountBuilder::default()
1243 .destination("/proc")
1244 .typ("proc")
1245 .source("proc")
1246 .options(vec!["nosuid".into(), "noexec".into(), "nodev".into()])
1247 .build()?,
1248 MountBuilder::default()
1249 .destination("/dev")
1250 .typ("tmpfs")
1251 .source("tmpfs")
1252 .options(vec![
1253 "nosuid".into(),
1254 "strictatime".into(),
1255 "mode=755".into(),
1256 "size=65536k".into(),
1257 ])
1258 .build()?,
1259 ];
1260 for dir in &["/usr", "/bin", "/sbin", "/lib", "/lib64"] {
1261 if Path::new(dir).exists() {
1262 mounts.push(
1263 MountBuilder::default()
1264 .destination(*dir)
1265 .typ("bind")
1266 .source(*dir)
1267 .options(vec!["rbind".into(), "ro".into()])
1268 .build()?,
1269 );
1270 }
1271 }
1272 Ok(mounts)
1273}
1274
1275fn generate_user_files(passwd_path: &Path, group_path: &Path) -> std::io::Result<()> {
1281 let username = std::env::var("USER")
1283 .or_else(|_| std::env::var("LOGNAME"))
1284 .unwrap_or_else(|_| "user".into());
1285
1286 let home = std::env::var("HOME").unwrap_or_else(|_| format!("/home/{username}"));
1287
1288 let shell = std::env::var("SHELL").unwrap_or_else(|_| "/bin/sh".into());
1289
1290 let gid = nix::unistd::getgid().as_raw();
1291
1292 let groupname = resolve_group_name(gid).unwrap_or_else(|| username.clone());
1294
1295 let passwd = format!(
1302 "{username}:x:0:0::{home}:{shell}\nnobody:x:65534:65534:nobody:/nonexistent:/sbin/nologin\n"
1303 );
1304
1305 let group = format!("{groupname}:x:0:{username}\nnobody:x:65534:\n");
1307
1308 std::fs::write(passwd_path, passwd)?;
1309 std::fs::write(group_path, group)?;
1310 Ok(())
1311}
1312
/// Looks up the name of the group with the given GID in `/etc/group`.
///
/// Returns `None` when the file cannot be read or no entry matches. Lines that
/// do not have the `name:password:gid` shape (blank lines, comments, entries
/// with a non-numeric GID) are skipped rather than aborting the search.
fn resolve_group_name(gid: u32) -> Option<String> {
    let content = std::fs::read_to_string("/etc/group").ok()?;
    content.lines().find_map(|line| {
        let mut fields = line.splitn(4, ':');
        let name = fields.next()?;
        let _password = fields.next()?;
        let entry_gid: u32 = fields.next()?.parse().ok()?;
        (entry_gid == gid).then(|| name.to_owned())
    })
}
1327
1328fn which_binary(name: &str) -> Result<PathBuf, ()> {
1331 which::which(name).map_err(|_| ())
1332}
1333
/// Returns `path` as an absolute path string.
///
/// Absolute inputs are returned unchanged. Relative inputs are joined onto the
/// current working directory; `None` is returned if that directory cannot be
/// determined. No normalisation of `.` or `..` components is performed.
fn to_absolute(path: &str) -> Option<String> {
    let candidate = std::path::Path::new(path);
    if !candidate.is_absolute() {
        let cwd = std::env::current_dir().ok()?;
        return Some(cwd.join(candidate).display().to_string());
    }
    Some(path.to_owned())
}