Skip to main content

nucleus/isolation/
attach.rs

1use crate::container::{ContainerState, ProcessIdentity};
2use crate::error::{NucleusError, Result};
3use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
4use nix::unistd::{fork, ForkResult, Pid};
5use std::ffi::CString;
6use std::fs::File;
7use std::os::unix::io::AsRawFd;
8use std::thread;
9use std::time::{Duration, Instant};
10use tracing::info;
11
/// Attach to a running container by entering its namespaces.
///
/// Stateless marker type: all functionality is exposed through associated
/// functions such as [`ContainerAttach::attach`].
#[derive(Debug, Clone, Copy, Default)]
pub struct ContainerAttach;
14
/// Minimal probe operations that can be executed after joining container namespaces.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NamespaceProbe {
    /// Execute a command; element 0 is the program and the full vector is
    /// passed as the argument list to `execve`.
    Exec(Vec<String>),
    /// Attempt a TCP connection to `127.0.0.1` on the given port; the probe
    /// succeeds if the connection is established.
    TcpConnect(u16),
}
20
/// Run trusted helper actions inside a container's namespaces.
///
/// Stateless marker type: the entry point is [`NamespaceCommandRunner::run`].
#[derive(Debug, Clone, Copy, Default)]
pub struct NamespaceCommandRunner;
23
24impl ContainerAttach {
25    /// Attach to a running container and execute a command
26    ///
27    /// Opens namespace FDs from /proc/`<pid>`/ns/\*, forks, calls setns(2) for each,
28    /// then execve the command. Parent waits with waitpid.
29    pub fn attach(state: &ContainerState, command: Vec<String>) -> Result<i32> {
30        if !state.is_running() {
31            return Err(NucleusError::AttachError(format!(
32                "Container {} is not running",
33                state.id
34            )));
35        }
36
37        // Validate caller owns the container (or is root)
38        let current_uid = nix::unistd::Uid::effective().as_raw();
39        if current_uid != 0 && current_uid != state.creator_uid {
40            return Err(NucleusError::AttachError(format!(
41                "Permission denied: container {} owned by UID {}, caller is UID {}",
42                state.id, state.creator_uid, current_uid
43            )));
44        }
45
46        // gVisor containers run under runsc; the host PID is the gVisor sandbox
47        // supervisor, not the guest workload. nsenter cannot reach the guest.
48        if state.using_gvisor {
49            return Err(NucleusError::AttachError(format!(
50                "Container {} uses gVisor runtime; attach is not supported \
51                 (use 'runsc exec' to interact with the guest workload)",
52                state.id
53            )));
54        }
55
56        let pid = state.pid;
57        info!("Attaching to container {} (PID {})", state.id, pid);
58
59        let ns_fds = Self::open_namespace_fds(pid, state.rootless)?;
60
61        // Fork child
62        match unsafe { fork() }
63            .map_err(|e| NucleusError::AttachError(format!("Fork failed: {}", e)))?
64        {
65            ForkResult::Parent { child } => {
66                // Parent: wait for child
67                Self::wait_for_child(child)
68            }
69            ForkResult::Child => {
70                // Child: enter namespaces and exec
71                match Self::enter_and_exec(&ns_fds, &command) {
72                    Ok(_) => unreachable!(),
73                    Err(e) => {
74                        eprintln!("Attach failed: {}", e);
75                        std::process::exit(1);
76                    }
77                }
78            }
79        }
80    }
81
82    fn enter_and_exec(ns_fds: &[(String, File)], command: &[String]) -> Result<()> {
83        if command.is_empty() {
84            return Err(NucleusError::AttachError(
85                "No command specified for attach".to_string(),
86            ));
87        }
88
89        Self::enter_namespaces(ns_fds)?;
90        Self::apply_exec_hardening()?;
91        let env = Self::default_exec_env()?;
92        Self::exec_with_env(command, &env)
93    }
94
95    fn open_namespace_fds(pid: u32, rootless: bool) -> Result<Vec<(String, File)>> {
96        let ns_types = if rootless {
97            &["user", "pid", "mnt", "net", "uts", "ipc", "cgroup"][..]
98        } else {
99            &["pid", "mnt", "net", "uts", "ipc", "cgroup"][..]
100        };
101        let mut ns_fds: Vec<(String, File)> = Vec::new();
102
103        for ns in ns_types {
104            let ns_path = format!("/proc/{}/ns/{}", pid, ns);
105            match File::open(&ns_path) {
106                Ok(f) => ns_fds.push(((*ns).to_string(), f)),
107                Err(e) => {
108                    // Some namespaces may not be available
109                    info!("Skipping namespace {}: {}", ns, e);
110                }
111            }
112        }
113
114        if ns_fds.is_empty() {
115            return Err(NucleusError::AttachError(
116                "Could not open any namespace FDs".to_string(),
117            ));
118        }
119
120        Ok(ns_fds)
121    }
122
123    fn enter_namespaces(ns_fds: &[(String, File)]) -> Result<()> {
124        // Enter user namespace first (required before other setns calls in
125        // rootless containers), then non-PID namespaces.
126        // PID namespace membership only applies to future children after setns().
127        let mut pid_ns_fd: Option<&File> = None;
128
129        // Phase 1: user namespace (must be first)
130        for (ns_name, fd) in ns_fds {
131            if ns_name == "user" {
132                let ret = unsafe { libc::setns(fd.as_raw_fd(), libc::CLONE_NEWUSER) };
133                if ret != 0 {
134                    let err = std::io::Error::last_os_error();
135                    return Err(NucleusError::AttachError(format!(
136                        "setns(user) failed: {}",
137                        err
138                    )));
139                }
140                info!("Entered user namespace");
141            }
142        }
143
144        // Phase 2: non-PID, non-user namespaces
145        for (ns_name, fd) in ns_fds {
146            if ns_name == "pid" {
147                pid_ns_fd = Some(fd);
148                continue;
149            }
150            if ns_name == "user" {
151                continue; // already joined above
152            }
153
154            let nstype = Self::ns_name_to_clone_flag(ns_name);
155            let raw_fd = fd.as_raw_fd();
156            let ret = unsafe { libc::setns(raw_fd, nstype) };
157            if ret != 0 {
158                let err = std::io::Error::last_os_error();
159                return Err(NucleusError::AttachError(format!(
160                    "setns({}) failed: {}",
161                    ns_name, err
162                )));
163            }
164            info!("Entered {} namespace", ns_name);
165        }
166
167        if let Some(fd) = pid_ns_fd {
168            let ret = unsafe { libc::setns(fd.as_raw_fd(), libc::CLONE_NEWPID) };
169            if ret != 0 {
170                let err = std::io::Error::last_os_error();
171                return Err(NucleusError::AttachError(format!(
172                    "setns(pid) failed: {}",
173                    err
174                )));
175            }
176            info!("Entered pid namespace");
177
178            // A second fork is required for PID namespace to take effect.
179            match unsafe { fork() }.map_err(|e| {
180                NucleusError::AttachError(format!("Fork failed after setns(pid): {}", e))
181            })? {
182                ForkResult::Parent { child } => {
183                    let code = Self::wait_for_child(child)?;
184                    std::process::exit(code);
185                }
186                ForkResult::Child => {
187                    // Continue and exec below.
188                }
189            }
190        }
191
192        // Change to root directory of the namespace
193        nix::unistd::chdir("/")
194            .map_err(|e| NucleusError::AttachError(format!("chdir(\"/\") failed: {}", e)))?;
195
196        Ok(())
197    }
198
199    fn apply_exec_hardening() -> Result<()> {
200        // Apply security hardening before exec: no_new_privs + capability drop
201        let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
202        if ret != 0 {
203            return Err(NucleusError::AttachError(format!(
204                "Failed to set PR_SET_NO_NEW_PRIVS: {}",
205                std::io::Error::last_os_error()
206            )));
207        }
208
209        let mut cap_mgr = crate::security::CapabilityManager::new();
210        cap_mgr.drop_all().map_err(|e| {
211            NucleusError::AttachError(format!("Failed to drop capabilities: {}", e))
212        })?;
213
214        Ok(())
215    }
216
217    fn default_exec_env() -> Result<Vec<CString>> {
218        Ok(vec![
219            CString::new("PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin")
220                .map_err(|e| NucleusError::AttachError(format!("Invalid PATH env: {}", e)))?,
221            CString::new("TERM=xterm")
222                .map_err(|e| NucleusError::AttachError(format!("Invalid TERM env: {}", e)))?,
223            CString::new("HOME=/")
224                .map_err(|e| NucleusError::AttachError(format!("Invalid HOME env: {}", e)))?,
225        ])
226    }
227
228    fn exec_with_env(command: &[String], env: &[CString]) -> Result<()> {
229        let program = CString::new(command[0].as_str())
230            .map_err(|e| NucleusError::AttachError(format!("Invalid program name: {}", e)))?;
231
232        let args: std::result::Result<Vec<CString>, _> = command
233            .iter()
234            .map(|arg| CString::new(arg.as_str()))
235            .collect();
236        let args =
237            args.map_err(|e| NucleusError::AttachError(format!("Invalid argument: {}", e)))?;
238
239        nix::unistd::execve::<CString, CString>(&program, &args, env)
240            .map_err(|e| NucleusError::AttachError(format!("execve failed: {}", e)))?;
241
242        Ok(())
243    }
244
245    fn ns_name_to_clone_flag(name: &str) -> libc::c_int {
246        match name {
247            "user" => libc::CLONE_NEWUSER,
248            "pid" => libc::CLONE_NEWPID,
249            "mnt" => libc::CLONE_NEWNS,
250            "net" => libc::CLONE_NEWNET,
251            "uts" => libc::CLONE_NEWUTS,
252            "ipc" => libc::CLONE_NEWIPC,
253            "cgroup" => libc::CLONE_NEWCGROUP,
254            // Unknown namespace type: use 0 (kernel infers from FD)
255            _ => 0,
256        }
257    }
258
259    fn wait_for_child(child: Pid) -> Result<i32> {
260        loop {
261            match waitpid(child, None) {
262                Ok(WaitStatus::Exited(_, code)) => return Ok(code),
263                Ok(WaitStatus::Signaled(_, signal, _)) => return Ok(128 + signal as i32),
264                Err(nix::errno::Errno::EINTR) => continue,
265                Err(e) => {
266                    return Err(NucleusError::AttachError(format!("waitpid failed: {}", e)));
267                }
268                _ => continue,
269            }
270        }
271    }
272}
273
274impl NamespaceCommandRunner {
275    /// Run a probe-style helper inside the target container's namespaces.
276    ///
277    /// This enters namespaces in-process, then immediately applies
278    /// `PR_SET_NO_NEW_PRIVS` and drops capabilities before executing any
279    /// container-controlled binary. That avoids running helpers via a privileged
280    /// host `nsenter` process.
281    pub fn run(
282        pid: u32,
283        rootless: bool,
284        using_gvisor: bool,
285        probe: NamespaceProbe,
286        process_identity: Option<&ProcessIdentity>,
287        timeout: Option<Duration>,
288    ) -> Result<bool> {
289        if using_gvisor {
290            return Err(NucleusError::ExecError(
291                "Namespace-local exec probes are unsupported for gVisor containers".to_string(),
292            ));
293        }
294
295        let ns_fds = ContainerAttach::open_namespace_fds(pid, rootless)?;
296
297        match unsafe { fork() }.map_err(|e| {
298            NucleusError::ExecError(format!("Failed to fork namespace helper: {}", e))
299        })? {
300            ForkResult::Parent { child } => Self::wait_for_probe(child, timeout),
301            ForkResult::Child => {
302                let exit_code =
303                    match Self::enter_and_run(&ns_fds, probe, process_identity, rootless) {
304                        Ok(true) => 0,
305                        Ok(false) => 1,
306                        Err(e) => {
307                            eprintln!("Namespace helper failed: {}", e);
308                            125
309                        }
310                    };
311                std::process::exit(exit_code);
312            }
313        }
314    }
315
316    fn enter_and_run(
317        ns_fds: &[(String, File)],
318        probe: NamespaceProbe,
319        process_identity: Option<&ProcessIdentity>,
320        rootless: bool,
321    ) -> Result<bool> {
322        ContainerAttach::enter_namespaces(ns_fds)?;
323        ContainerAttach::apply_exec_hardening()?;
324
325        match probe {
326            NamespaceProbe::Exec(command) => {
327                if let Some(identity) = process_identity {
328                    crate::container::Container::apply_process_identity_to_current_process(
329                        identity, rootless,
330                    )?;
331                }
332                let env = ContainerAttach::default_exec_env()?;
333                ContainerAttach::exec_with_env(&command, &env)?;
334                unreachable!()
335            }
336            NamespaceProbe::TcpConnect(port) => {
337                let addr = std::net::SocketAddr::from(([127, 0, 0, 1], port));
338                Ok(std::net::TcpStream::connect_timeout(&addr, Duration::from_secs(2)).is_ok())
339            }
340        }
341    }
342
343    fn wait_for_probe(child: Pid, timeout: Option<Duration>) -> Result<bool> {
344        let start = Instant::now();
345        loop {
346            match waitpid(child, Some(WaitPidFlag::WNOHANG)) {
347                Ok(WaitStatus::StillAlive) => {
348                    if let Some(limit) = timeout {
349                        if start.elapsed() >= limit {
350                            let _ =
351                                nix::sys::signal::kill(child, nix::sys::signal::Signal::SIGKILL);
352                            let _ = waitpid(child, None);
353                            return Ok(false);
354                        }
355                    }
356                    thread::sleep(Duration::from_millis(50));
357                }
358                Ok(WaitStatus::Exited(_, code)) => return Ok(code == 0),
359                Ok(WaitStatus::Signaled(_, _, _)) => return Ok(false),
360                Err(nix::errno::Errno::EINTR) => continue,
361                Err(e) => {
362                    return Err(NucleusError::ExecError(format!(
363                        "Failed waiting for namespace helper: {}",
364                        e
365                    )));
366                }
367                _ => continue,
368            }
369        }
370    }
371}