Skip to main content

libcontainer/process/
container_main_process.rs

1use std::collections::HashMap;
2use std::fs;
3use std::fs::File;
4use std::os::fd::AsRawFd;
5use std::path::PathBuf;
6
7use nix::sys::wait::{WaitStatus, waitpid};
8use nix::unistd::Pid;
9use oci_spec::runtime::{Linux, LinuxNamespaceType};
10#[cfg(feature = "libseccomp")]
11use oci_spec::runtime::{SECCOMP_FD_NAME, VERSION as OCI_VERSION};
12
13use crate::hooks;
14use crate::network::network_device::dev_change_net_namespace;
15use crate::process::args::{ContainerArgs, ContainerType};
16use crate::process::fork::{self, CloneCb};
17use crate::process::intel_rdt::setup_intel_rdt;
18use crate::process::{channel, container_intermediate_process};
19use crate::syscall::SyscallError;
20use crate::user_ns::UserNamespaceConfig;
21
22#[derive(Debug, thiserror::Error)]
23pub enum ProcessError {
24    #[error(transparent)]
25    Channel(#[from] channel::ChannelError),
26    #[error("failed to write deny to setgroups")]
27    SetGroupsDeny(#[source] std::io::Error),
28    #[error(transparent)]
29    UserNamespace(#[from] crate::user_ns::UserNamespaceError),
30    #[error("container state is required")]
31    ContainerStateRequired,
32    #[error("failed to wait for intermediate process")]
33    WaitIntermediateProcess(#[source] nix::Error),
34    #[error(transparent)]
35    IntelRdt(#[from] crate::process::intel_rdt::IntelRdtError),
36    #[error("failed to create intermediate process")]
37    IntermediateProcessFailed(#[source] fork::CloneError),
38    #[error("failed seccomp listener")]
39    #[cfg(feature = "libseccomp")]
40    SeccompListener(#[from] crate::process::seccomp_listener::SeccompListenerError),
41    #[error("failed setup network device")]
42    Network(#[from] crate::network::NetworkError),
43    #[error("failed syscall")]
44    SyscallOther(#[source] SyscallError),
45    #[error("failed hooks {0}")]
46    Hooks(#[from] crate::hooks::HookError),
47    #[error("failed to build OCI state: {0}")]
48    OciStateBuild(String),
49}
50
51type Result<T> = std::result::Result<T, ProcessError>;
52
53pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bool)> {
54    // We use a set of channels to communicate between parent and child process.
55    // Each channel is uni-directional. Because we will pass these channel to
56    // cloned process, we have to be deligent about closing any unused channel.
57    // At minimum, we have to close down any unused senders. The corresponding
58    // receivers will be cleaned up once the senders are closed down.
59    let (mut main_sender, mut main_receiver) = channel::main_channel()?;
60    let mut inter_chan = channel::intermediate_channel()?;
61    let mut init_chan = channel::init_channel()?;
62
63    let cb: CloneCb = {
64        Box::new(|| {
65            if let Err(ret) = prctl::set_name("youki:[1:INTER]") {
66                tracing::error!(?ret, "failed to set name for child process");
67                return ret;
68            }
69
70            match container_intermediate_process::container_intermediate_process(
71                container_args,
72                &mut inter_chan,
73                &mut init_chan,
74                &mut main_sender,
75            ) {
76                Ok(_) => 0,
77                Err(err) => {
78                    tracing::error!("failed to run intermediate process {}", err);
79                    match main_sender.send_error(err.to_string()) {
80                        Ok(_) => {}
81                        Err(e) => {
82                            tracing::error!(
83                                "error in sending intermediate error message {} to main: {}",
84                                err,
85                                e
86                            )
87                        }
88                    }
89                    -1
90                }
91            }
92        })
93    };
94
95    let container_clone_fn = if container_args.as_sibling {
96        fork::container_clone_sibling
97    } else {
98        fork::container_clone
99    };
100
101    let intermediate_pid = container_clone_fn(cb).map_err(|err| {
102        tracing::error!("failed to fork intermediate process: {}", err);
103        ProcessError::IntermediateProcessFailed(err)
104    })?;
105
106    // Close down unused fds. The corresponding fds are duplicated to the
107    // child process during clone.
108    main_sender.close().map_err(|err| {
109        tracing::error!("failed to close unused sender: {}", err);
110        err
111    })?;
112
113    let (mut inter_sender, inter_receiver) = inter_chan;
114    let (mut init_sender, init_receiver) = init_chan;
115
116    // If creating a container with new user namespace, the intermediate process will ask
117    // the main process to set up uid and gid mapping, once the intermediate
118    // process enters into a new user namespace.
119    if let Some(config) = &container_args.user_ns_config {
120        main_receiver.wait_for_mapping_request()?;
121        setup_mapping(config, intermediate_pid)?;
122        inter_sender.mapping_written()?;
123    }
124
125    // At this point, we don't need to send any message to intermediate process anymore,
126    // so we want to close this sender at the earliest point.
127    inter_sender.close().map_err(|err| {
128        tracing::error!("failed to close unused intermediate sender: {}", err);
129        err
130    })?;
131
132    // The intermediate process will send the init pid once it forks the init
133    // process.  The intermediate process should exit after this point.
134    let init_pid = main_receiver.wait_for_intermediate_ready()?;
135    let mut need_to_clean_up_intel_rdt_subdirectory = false;
136
137    if let Some(linux) = container_args.spec.linux() {
138        if let Some(intel_rdt) = linux.intel_rdt() {
139            let container_id = container_args
140                .container
141                .as_ref()
142                .map(|container| container.id());
143            need_to_clean_up_intel_rdt_subdirectory =
144                setup_intel_rdt(container_id, &init_pid, intel_rdt)?;
145        }
146    }
147
148    // if file to write the pid to is specified, write pid of the child
149    if let Some(pid_file) = &container_args.pid_file {
150        if let Err(err) = fs::write(pid_file, format!("{init_pid}")) {
151            tracing::warn!("failed to write pid to file: {err}");
152        }
153    }
154
155    if matches!(container_args.container_type, ContainerType::InitContainer) {
156        if let Some(hooks) = container_args.spec.hooks() {
157            main_receiver.wait_for_hook_request()?;
158            if let Some(container_for_hooks) = &container_args.container {
159                hooks::run_hooks(
160                    hooks.prestart().as_ref(),
161                    Some(&container_for_hooks.state),
162                    None,
163                    Some(init_pid),
164                )
165                .map_err(|err| {
166                    tracing::error!("failed to run prestart hooks: {}", err);
167                    err
168                })?;
169
170                hooks::run_hooks(
171                    hooks.create_runtime().as_ref(),
172                    Some(&container_for_hooks.state),
173                    None,
174                    Some(init_pid),
175                )
176                .map_err(|err| {
177                    tracing::error!("failed to run create runtime hooks: {}", err);
178                    err
179                })?;
180            }
181            init_sender.hook_done()?;
182        }
183    }
184
185    if let Some(linux) = container_args.spec.linux() {
186        move_network_devices_to_container(linux, init_pid, &mut main_receiver, &mut init_sender)?;
187
188        #[cfg(feature = "libseccomp")]
189        if let Some(seccomp) = linux.seccomp() {
190            let container = container_args
191                .container
192                .as_ref()
193                .ok_or(ProcessError::ContainerStateRequired)?;
194
195            // Determine OCI status based on container type (matching runc behavior)
196            let oci_status = match container_args.container_type {
197                ContainerType::InitContainer => oci_spec::runtime::ContainerState::Creating,
198                ContainerType::TenantContainer { .. } => oci_spec::runtime::ContainerState::Running,
199            };
200
201            // Build OCI-compliant ContainerProcessState using builder pattern
202            let oci_state = oci_spec::runtime::StateBuilder::default()
203                .version(OCI_VERSION)
204                .id(container.state.id.clone())
205                .status(oci_status)
206                .pid(init_pid.as_raw())
207                .bundle(container.state.bundle.clone())
208                .annotations(container.state.annotations.clone().unwrap_or_default())
209                .build()
210                .map_err(|e| ProcessError::OciStateBuild(e.to_string()))?;
211
212            let state = oci_spec::runtime::ContainerProcessStateBuilder::default()
213                .version(OCI_VERSION)
214                .fds(vec![SECCOMP_FD_NAME.to_string()])
215                .pid(init_pid.as_raw())
216                .metadata(seccomp.listener_metadata().clone().unwrap_or_default())
217                .state(oci_state)
218                .build()
219                .map_err(|e| ProcessError::OciStateBuild(e.to_string()))?;
220            crate::process::seccomp_listener::sync_seccomp(
221                seccomp,
222                &state,
223                &mut init_sender,
224                &mut main_receiver,
225            )?;
226        }
227    }
228
229    // We don't need to send anything to the init process after this point, so
230    // close the sender.
231    init_sender.close().map_err(|err| {
232        tracing::error!("failed to close unused init sender: {}", err);
233        err
234    })?;
235
236    main_receiver.wait_for_init_ready().map_err(|err| {
237        tracing::error!("failed to wait for init ready: {}", err);
238        err
239    })?;
240
241    tracing::debug!("init pid is {:?}", init_pid);
242
243    // Close the receiver ends to avoid leaking file descriptors.
244
245    inter_receiver.close().map_err(|err| {
246        tracing::error!("failed to close intermediate process receiver: {}", err);
247        err
248    })?;
249
250    init_receiver.close().map_err(|err| {
251        tracing::error!("failed to close init process receiver: {}", err);
252        err
253    })?;
254
255    main_receiver.close().map_err(|err| {
256        tracing::error!("failed to close main process receiver: {}", err);
257        err
258    })?;
259
260    // Before the main process returns, we want to make sure the intermediate
261    // process is exit and reaped. By this point, the intermediate process
262    // should already exited successfully. If intermediate process errors out,
263    // the `init_ready` will not be sent.
264    match waitpid(intermediate_pid, None) {
265        Ok(WaitStatus::Exited(_, 0)) => (),
266        Ok(WaitStatus::Exited(_, s)) => {
267            tracing::warn!("intermediate process failed with exit status: {s}");
268        }
269        Ok(WaitStatus::Signaled(_, sig, _)) => {
270            tracing::warn!("intermediate process killed with signal: {sig}")
271        }
272        Ok(_) => (),
273        Err(nix::errno::Errno::ECHILD) => {
274            // This is safe because intermediate_process and main_process check if the process is
275            // finished by piping instead of exit code.
276            tracing::warn!("intermediate process already reaped");
277        }
278        Err(err) => return Err(ProcessError::WaitIntermediateProcess(err)),
279    };
280
281    Ok((init_pid, need_to_clean_up_intel_rdt_subdirectory))
282}
283
284fn setup_mapping(config: &UserNamespaceConfig, pid: Pid) -> Result<()> {
285    tracing::debug!("write mapping for pid {:?}", pid);
286    // CVE-2014-8989 requires "deny" before a direct write to gid_map from
287    // an unprivileged process. The newuidmap/newgidmap binaries (with
288    // cap_setuid/cap_setgid) handle setgroups themselves, so we must NOT
289    // pre-write "deny" on that path -- once "deny" is set, the kernel
290    // refuses to flip it back to "allow" without CAP_SYS_ADMIN in the
291    // parent userns, and any setgroups() call inside the container fails.
292    if !config.privileged && config.newuidmap.is_none() && config.newgidmap.is_none() {
293        std::fs::write(format!("/proc/{pid}/setgroups"), "deny")
294            .map_err(ProcessError::SetGroupsDeny)?;
295    }
296
297    config.write_uid_mapping(pid).map_err(|err| {
298        tracing::error!("failed to write uid mapping for pid {:?}: {}", pid, err);
299        err
300    })?;
301    config.write_gid_mapping(pid).map_err(|err| {
302        tracing::error!("failed to write gid mapping for pid {:?}: {}", pid, err);
303        err
304    })?;
305    Ok(())
306}
307
308/// Moves configured network devices from the host to the container's network namespace.
309/// This function waits for the init process to join its namespace, then transfers each
310/// configured device while preserving network addresses. Returns early if the container
311/// runs in the host network namespace.
312fn move_network_devices_to_container(
313    linux: &Linux,
314    init_pid: Pid,
315    main_receiver: &mut channel::MainReceiver,
316    init_sender: &mut channel::InitSender,
317) -> Result<()> {
318    // Early return if there are no network devices to move
319    let devices = match linux.net_devices() {
320        Some(devs) if !devs.is_empty() => devs,
321        _ => return Ok(()),
322    };
323
324    if let Some(namespaces) = linux.namespaces() {
325        // network devices are not moved for containers running in the host network.
326        let net_ns = match namespaces
327            .iter()
328            .find(|ns| ns.typ() == LinuxNamespaceType::Network)
329        {
330            Some(ns) => ns,
331            None => return Ok(()),
332        };
333
334        // Wait for the init process to signal that it has joined the network namespace
335        // and is ready for network device setup
336        main_receiver.wait_for_network_setup_ready()?;
337
338        // the container init process has already joined the provided net namespace,
339        // so we can use the process's net ns path directly.
340        let default_ns_path = PathBuf::from(format!("/proc/{}/ns/net", init_pid.as_raw()));
341        let ns_path = net_ns.path().as_deref().unwrap_or(&default_ns_path);
342
343        // Open the network namespace file and validate it exists before moving devices
344        let netns_file = File::open(ns_path).map_err(|err| {
345            tracing::error!(
346                "failed to open network namespace at {}: {}",
347                ns_path.display(),
348                err
349            );
350            ProcessError::Network(err.into())
351        })?;
352        let netns_fd = netns_file.as_raw_fd();
353
354        // If moving any of the network devices fails, we return an error immediately.
355        // The runtime spec requires that the kernel handles moving back any devices
356        // that were successfully moved before the failure occurred.
357        // See: https://github.com/opencontainers/runtime-spec/blob/27cb0027fd92ef81eda1ea3a8153b8337f56d94a/config-linux.md#namespace-lifecycle-and-container-termination
358        let addrs_map = devices
359            .iter()
360            .map(|(name, net_dev)| {
361                let addrs = dev_change_net_namespace(name, netns_fd, net_dev).map_err(|err| {
362                    tracing::error!("failed to dev_change_net_namespace: {}", err);
363                    err
364                })?;
365                Ok((name.clone(), addrs))
366            })
367            .collect::<Result<HashMap<String, Vec<crate::network::cidr::CidrAddress>>>>()?;
368        init_sender.move_network_device(addrs_map)?;
369    }
370
371    Ok(())
372}
373
#[cfg(test)]
mod tests {
    //! Tests for `setup_mapping`.
    //!
    //! Each test forks: the child unshares a new user namespace and asks the
    //! parent for its id mapping; the parent writes the mapping through a
    //! test id mapper rooted in a temporary directory and checks the result.

    use std::fs;

    use anyhow::{Result, anyhow};
    use nix::sched::{CloneFlags, unshare};
    use nix::sys::wait::{WaitStatus, waitpid};
    use nix::unistd::{self, getgid, getuid};
    use oci_spec::runtime::LinuxIdMappingBuilder;
    use serial_test::serial;

    use super::*;
    use crate::process::channel::{intermediate_channel, main_channel};
    use crate::user_ns::UserNamespaceIDMapper;

    /// A privileged config with a single uid mapping must end up in the
    /// (faked) uid map file of the child.
    #[test]
    #[serial]
    fn setup_uid_mapping_should_succeed() -> Result<()> {
        let uid_mapping = LinuxIdMappingBuilder::default()
            .host_id(getuid())
            .container_id(0u32)
            .size(1u32)
            .build()?;
        let uid_mappings = vec![uid_mapping];
        let tmp = tempfile::tempdir()?;
        let id_mapper = UserNamespaceIDMapper::new_test(tmp.path().to_path_buf());
        let ns_config = UserNamespaceConfig {
            uid_mappings: Some(uid_mappings),
            privileged: true,
            id_mapper: id_mapper.clone(),
            ..Default::default()
        };
        let (mut parent_sender, mut parent_receiver) = main_channel()?;
        let (mut child_sender, mut child_receiver) = intermediate_channel()?;
        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                parent_receiver.wait_for_mapping_request()?;
                parent_receiver.close()?;

                // In test, we fake the uid path in /proc/{pid}/uid_map, so we
                // need to ensure the path exists before we write the mapping.
                // The path requires the pid we use, so we can only do so after
                // obtaining the child pid here.
                id_mapper.ensure_uid_path(&child)?;
                setup_mapping(&ns_config, child)?;
                let line = fs::read_to_string(id_mapper.get_uid_path(&child))?;
                let split_lines = line.split_whitespace();
                for (act, expect) in split_lines.zip([
                    uid_mapping.container_id().to_string(),
                    uid_mapping.host_id().to_string(),
                    uid_mapping.size().to_string(),
                ]) {
                    assert_eq!(act, expect);
                }
                child_sender.mapping_written()?;
                child_sender.close()?;

                // Reap the child so no zombie is left behind, and make sure
                // every step on its side succeeded.
                assert_eq!(waitpid(child, None)?, WaitStatus::Exited(child, 0));
            }
            unistd::ForkResult::Child => {
                // The child is a forked copy of the test harness. It must
                // never return or unwind into that copy, so its steps run in
                // a closure and it always leaves through `exit`, reporting
                // failure through the exit status the parent asserts on.
                let outcome = (|| -> Result<()> {
                    prctl::set_dumpable(true)
                        .map_err(|errno| anyhow!("failed to set dumpable: {errno:?}"))?;
                    unshare(CloneFlags::CLONE_NEWUSER)?;
                    parent_sender.identifier_mapping_request()?;
                    parent_sender.close()?;
                    child_receiver.wait_for_mapping_ack()?;
                    child_receiver.close()?;
                    Ok(())
                })();
                std::process::exit(if outcome.is_ok() { 0 } else { 1 });
            }
        }
        Ok(())
    }

    /// An unprivileged config with a single gid mapping must end up in the
    /// (faked) gid map file of the child, and `setgroups` must read `deny`.
    #[test]
    #[serial]
    fn setup_gid_mapping_should_succeed() -> Result<()> {
        let gid_mapping = LinuxIdMappingBuilder::default()
            .host_id(getgid())
            .container_id(0u32)
            .size(1u32)
            .build()?;
        let gid_mappings = vec![gid_mapping];
        let tmp = tempfile::tempdir()?;
        let id_mapper = UserNamespaceIDMapper::new_test(tmp.path().to_path_buf());
        let ns_config = UserNamespaceConfig {
            gid_mappings: Some(gid_mappings),
            id_mapper: id_mapper.clone(),
            ..Default::default()
        };
        let (mut parent_sender, mut parent_receiver) = main_channel()?;
        let (mut child_sender, mut child_receiver) = intermediate_channel()?;
        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                parent_receiver.wait_for_mapping_request()?;
                parent_receiver.close()?;

                // In test, we fake the gid path in /proc/{pid}/gid_map, so we
                // need to ensure the path exists before we write the mapping.
                // The path requires the pid we use, so we can only do so after
                // obtaining the child pid here.
                id_mapper.ensure_gid_path(&child)?;
                setup_mapping(&ns_config, child)?;
                let line = fs::read_to_string(id_mapper.get_gid_path(&child))?;
                let split_lines = line.split_whitespace();
                for (act, expect) in split_lines.zip([
                    gid_mapping.container_id().to_string(),
                    gid_mapping.host_id().to_string(),
                    gid_mapping.size().to_string(),
                ]) {
                    assert_eq!(act, expect);
                }
                assert_eq!(
                    fs::read_to_string(format!("/proc/{}/setgroups", child.as_raw()))?,
                    "deny\n",
                );
                child_sender.mapping_written()?;
                child_sender.close()?;

                // Reap the child so no zombie is left behind, and make sure
                // every step on its side succeeded.
                assert_eq!(waitpid(child, None)?, WaitStatus::Exited(child, 0));
            }
            unistd::ForkResult::Child => {
                // The child is a forked copy of the test harness. It must
                // never return or unwind into that copy, so its steps run in
                // a closure and it always leaves through `exit`, reporting
                // failure through the exit status the parent asserts on.
                let outcome = (|| -> Result<()> {
                    prctl::set_dumpable(true)
                        .map_err(|errno| anyhow!("failed to set dumpable: {errno:?}"))?;
                    unshare(CloneFlags::CLONE_NEWUSER)?;
                    parent_sender.identifier_mapping_request()?;
                    parent_sender.close()?;
                    child_receiver.wait_for_mapping_ack()?;
                    child_receiver.close()?;
                    Ok(())
                })();
                std::process::exit(if outcome.is_ok() { 0 } else { 1 });
            }
        }
        Ok(())
    }
}