Skip to main content

libcontainer/process/
container_intermediate_process.rs

1use std::os::fd::FromRawFd;
2use std::path::Path;
3
4use nix::unistd::{Gid, Pid, Uid, close, getpid, write};
5use oci_spec::runtime::{LinuxNamespace, LinuxNamespaceType, LinuxResources};
6use pathrs::flags::OpenFlags;
7use pathrs::procfs::{ProcfsBase, ProcfsHandle};
8use procfs::{FromRead, ProcessCGroups};
9
10use super::args::{ContainerArgs, ContainerType};
11use super::channel::{IntermediateReceiver, MainSender};
12use super::fork::CloneCb;
13use super::init::process as init_process;
14use crate::error::MissingSpecError;
15use crate::namespaces::Namespaces;
16use crate::process::{channel, cpu_affinity, fork};
17
18#[derive(Debug, thiserror::Error)]
19pub enum IntermediateProcessError {
20    #[error(transparent)]
21    Channel(#[from] channel::ChannelError),
22    #[error(transparent)]
23    Namespace(#[from] crate::namespaces::NamespaceError),
24    #[error(transparent)]
25    Syscall(#[from] crate::syscall::SyscallError),
26    #[error("failed to launch init process")]
27    InitProcess(#[source] fork::CloneError),
28    #[error("cgroup error: {0}")]
29    Cgroup(String),
30    #[error(transparent)]
31    Procfs(#[from] procfs::ProcError),
32    #[error(transparent)]
33    Pathrs(#[from] pathrs::error::Error),
34    #[error("exec notify failed")]
35    ExecNotify(#[source] nix::Error),
36    #[error(transparent)]
37    MissingSpec(#[from] crate::error::MissingSpecError),
38    #[error("CPU affinity error {0}")]
39    CpuAffinity(#[from] cpu_affinity::CPUAffinityError),
40    #[error("other error")]
41    Other(String),
42}
43
44type Result<T> = std::result::Result<T, IntermediateProcessError>;
45
46pub fn container_intermediate_process(
47    args: &ContainerArgs,
48    intermediate_chan: &mut (channel::IntermediateSender, channel::IntermediateReceiver),
49    init_chan: &mut (channel::InitSender, channel::InitReceiver),
50    main_sender: &mut channel::MainSender,
51) -> Result<()> {
52    let (inter_sender, inter_receiver) = intermediate_chan;
53    let (init_sender, init_receiver) = init_chan;
54    let command = args.syscall.create_syscall();
55    let spec = &args.spec;
56    let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
57    let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?;
58    let cgroup_manager = libcgroups::common::create_cgroup_manager(args.cgroup_config.to_owned())
59        .map_err(|e| IntermediateProcessError::Cgroup(e.to_string()))?;
60
61    let current_pid = Pid::this();
62    // setting CPU affinity for tenant container before cgroup move
63    if matches!(args.container_type, ContainerType::TenantContainer { .. }) {
64        if let Some(exec_cpu_affinity) = spec
65            .process()
66            .as_ref()
67            .and_then(|p| p.exec_cpu_affinity().as_ref())
68        {
69            if let Some(initial) = exec_cpu_affinity.initial() {
70                cpu_affinity::set_cpuset_affinity_from_string(current_pid, initial)?;
71            }
72        }
73    }
74    let _ = cpu_affinity::log_cpu_affinity();
75
76    // this needs to be done before we create the init process, so that the init
77    // process will already be captured by the cgroup. It also needs to be done
78    // before we enter the user namespace because if a privileged user starts a
79    // rootless container on a cgroup v1 system we can still fulfill resource
80    // restrictions through the cgroup fs support (delegation through systemd is
81    // not supported for v1 by us). This only works if the user has not yet been
82    // mapped to an unprivileged user by the user namespace however.
83    // In addition this needs to be done before we enter the cgroup namespace as
84    // the cgroup of the process will form the root of the cgroup hierarchy in
85    // the cgroup namespace.
86    apply_cgroups(
87        &cgroup_manager,
88        linux.resources().as_ref(),
89        args.container_type,
90    )?;
91
92    // setting CPU affinity for tenant container after cgroup move
93    if matches!(args.container_type, ContainerType::TenantContainer { .. }) {
94        if let Some(exec_cpu_affinity) = spec
95            .process()
96            .as_ref()
97            .and_then(|p| p.exec_cpu_affinity().as_ref())
98        {
99            if let Some(cpu_affinity_final) = exec_cpu_affinity.cpu_affinity_final() {
100                cpu_affinity::set_cpuset_affinity_from_string(current_pid, cpu_affinity_final)?;
101            }
102        }
103    }
104
105    // if new user is specified in specification, this will be true and new
106    // namespace will be created, check
107    // https://man7.org/linux/man-pages/man7/user_namespaces.7.html for more
108    // information
109    if let Some(user_namespace) = namespaces.get(LinuxNamespaceType::User)? {
110        setup_userns(&namespaces, user_namespace, main_sender, inter_receiver)?;
111
112        // After UID and GID mapping is configured correctly in the Youki main
113        // process, We want to make sure continue as the root user inside the
114        // new user namespace. This is required because the process of
115        // configuring the container process will require root, even though the
116        // root in the user namespace likely is mapped to an non-privileged user
117        // on the parent user namespace.
118        command.set_id(Uid::from_raw(0), Gid::from_raw(0))?;
119    }
120
121    // set limits and namespaces to the process
122    let proc = spec.process().as_ref().ok_or(MissingSpecError::Process)?;
123    if let Some(rlimits) = proc.rlimits() {
124        for rlimit in rlimits {
125            command.set_rlimit(rlimit).map_err(|err| {
126                tracing::error!(?err, ?rlimit, "failed to set rlimit");
127                err
128            })?;
129        }
130    }
131
132    // Pid namespace requires an extra fork to enter, so we enter pid namespace now.
133    if let Some(pid_namespace) = namespaces.get(LinuxNamespaceType::Pid)? {
134        namespaces.unshare_or_setns(pid_namespace)?;
135    }
136
137    let cb: CloneCb = {
138        Box::new(|| {
139            if let Err(ret) = prctl::set_name("youki:[2:INIT]") {
140                tracing::error!(?ret, "failed to set name for child process");
141                return ret;
142            }
143
144            // We are inside the forked process here. The first thing we have to do
145            // is to close any unused senders, since fork will make a dup for all
146            // the socket.
147            if let Err(err) = init_sender.close() {
148                tracing::error!(?err, "failed to close receiver in init process");
149                return -1;
150            }
151            if let Err(err) = inter_sender.close() {
152                tracing::error!(?err, "failed to close sender in the intermediate process");
153                return -1;
154            }
155            match init_process::container_init_process(args, main_sender, init_receiver) {
156                Ok(_) => 0,
157                Err(e) => {
158                    tracing::error!("failed to initialize container process: {e}");
159                    if let Err(err) = main_sender.exec_failed(e.to_string()) {
160                        tracing::error!(?err, "failed sending error to main sender");
161                    }
162                    if let ContainerType::TenantContainer { exec_notify_fd, .. } =
163                        args.container_type
164                    {
165                        let buf = format!("{e}");
166                        let exec_notify_fd =
167                            unsafe { std::os::fd::OwnedFd::from_raw_fd(exec_notify_fd) };
168                        if let Err(err) = write(&exec_notify_fd, buf.as_bytes()) {
169                            tracing::error!(?err, "failed to write to exec notify fd");
170                        }
171
172                        // After sending the error through the exec_notify_fd,
173                        // we need to explicitly close the pipe.
174                        drop(exec_notify_fd);
175                    }
176                    -1
177                }
178            }
179        })
180    };
181
182    // We have to record the pid of the init process. The init process will be
183    // inside the pid namespace, so we can't rely on the init process to send us
184    // the correct pid. We also want to clone the init process as a sibling
185    // process to the intermediate process. The intermediate process is only
186    // used as a jumping board to set the init process to the correct
187    // configuration. The youki main process can decide what to do with the init
188    // process and the intermediate process can just exit safely after the job
189    // is done.
190    let pid = fork::container_clone_sibling(cb).map_err(|err| {
191        tracing::error!("failed to fork init process: {}", err);
192        IntermediateProcessError::InitProcess(err)
193    })?;
194
195    // Close the exec_notify_fd in this process
196    if let ContainerType::TenantContainer { exec_notify_fd, .. } = args.container_type {
197        close(exec_notify_fd).map_err(|err| {
198            tracing::error!("failed to close exec notify fd: {}", err);
199            IntermediateProcessError::ExecNotify(err)
200        })?;
201    }
202
203    main_sender.intermediate_ready(pid).map_err(|err| {
204        tracing::error!("failed to wait on intermediate process: {}", err);
205        err
206    })?;
207
208    // Close unused senders here so we don't have lingering socket around.
209    main_sender.close().map_err(|err| {
210        tracing::error!("failed to close unused main sender: {}", err);
211        err
212    })?;
213    inter_sender.close().map_err(|err| {
214        tracing::error!(
215            "failed to close sender in the intermediate process: {}",
216            err
217        );
218        err
219    })?;
220    init_sender.close().map_err(|err| {
221        tracing::error!("failed to close unused init sender: {}", err);
222        err
223    })?;
224
225    Ok(())
226}
227
228fn setup_userns(
229    namespaces: &Namespaces,
230    user_namespace: &LinuxNamespace,
231    sender: &mut MainSender,
232    receiver: &mut IntermediateReceiver,
233) -> Result<()> {
234    namespaces.unshare_or_setns(user_namespace)?;
235    if user_namespace.path().is_some() {
236        return Ok(());
237    }
238
239    tracing::debug!("creating new user namespace");
240    // child needs to be dumpable, otherwise the non root parent is not
241    // allowed to write the uid/gid maps
242    prctl::set_dumpable(true).map_err(|e| {
243        IntermediateProcessError::Other(format!(
244            "error in setting dumpable to true : {}",
245            nix::errno::Errno::from_raw(e)
246        ))
247    })?;
248    sender.identifier_mapping_request().map_err(|err| {
249        tracing::error!("failed to send id mapping request: {}", err);
250        err
251    })?;
252    receiver.wait_for_mapping_ack().map_err(|err| {
253        tracing::error!("failed to receive id mapping ack: {}", err);
254        err
255    })?;
256    prctl::set_dumpable(false).map_err(|e| {
257        IntermediateProcessError::Other(format!(
258            "error in setting dumplable to false : {}",
259            nix::errno::Errno::from_raw(e)
260        ))
261    })?;
262    Ok(())
263}
264
/// Reports whether `_err` is a systemd cgroup manager error flagged as EBUSY.
///
/// Without the `systemd` feature this is always `false`. With the feature
/// enabled, the error is downcast to `libcgroups::common::AnyManagerError` and
/// the answer is taken from the wrapped systemd error's `is_ebusy()`.
fn is_ebusy<E>(_err: &E) -> bool
where
    E: std::error::Error + Send + Sync + 'static,
{
    #[cfg(feature = "systemd")]
    {
        // Erase the concrete type so the error can be downcast.
        let erased: &(dyn std::error::Error + 'static) = _err;
        match erased.downcast_ref::<libcgroups::common::AnyManagerError>() {
            Some(libcgroups::common::AnyManagerError::Systemd(inner)) => inner.is_ebusy(),
            _ => false,
        }
    }

    #[cfg(not(feature = "systemd"))]
    {
        false
    }
}
279
280fn apply_cgroups<
281    C: libcgroups::common::CgroupManager<Error = E> + ?Sized,
282    E: std::error::Error + Send + Sync + 'static,
283>(
284    cmanager: &C,
285    resources: Option<&LinuxResources>,
286    container_type: ContainerType,
287) -> Result<()> {
288    let init = matches!(container_type, ContainerType::InitContainer);
289
290    let pid = getpid();
291    if let Err(err) = cmanager.add_task(pid) {
292        if !init && is_ebusy(&err) {
293            // If adding the process to the cgroup fails due to a "Device or resource busy" error,
294            // manager tries to join the cgroup of the init process of the parent container.
295            tracing::debug!(
296                "failed to add task to cgroup, trying to join parent's init process cgroup"
297            );
298
299            if let ContainerType::TenantContainer {
300                exec_notify_fd: _,
301                landlord_init_pid,
302            } = container_type
303                && let Some(landlord_init_pid) = landlord_init_pid
304                && let Some(landlord_init_proc_cgroup) =
305                    ProcessCGroups::from_read(ProcfsHandle::new()?.open(
306                        ProcfsBase::ProcPid(landlord_init_pid.as_raw() as u32),
307                        "cgroup",
308                        OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC,
309                    )?)?
310                    .into_iter()
311                    .find(|c| c.controllers.is_empty())
312                && let Some(landlord_init_proc_cgroup_path) =
313                    landlord_init_proc_cgroup.pathname.strip_prefix("/")
314            {
315                libcgroups::common::write_cgroup_file(
316                    Path::new(libcgroups::common::DEFAULT_CGROUP_ROOT)
317                        .join(Path::new(landlord_init_proc_cgroup_path))
318                        .join(libcgroups::common::CGROUP_PROCS),
319                    pid,
320                )
321                .map_err(|err| IntermediateProcessError::Cgroup(err.to_string()))?;
322                return Ok(());
323            }
324        }
325
326        tracing::error!(?pid, ?err, ?init, "failed to add task to cgroup");
327        return Err(IntermediateProcessError::Cgroup(err.to_string()));
328    }
329
330    if let Some(resources) = resources {
331        if init {
332            let controller_opt = libcgroups::common::ControllerOpt {
333                resources,
334                freezer_state: None,
335                oom_score_adj: None,
336                disable_oom_killer: false,
337            };
338
339            cmanager.apply(&controller_opt).map_err(|err| {
340                tracing::error!(?pid, ?err, ?init, "failed to apply cgroup");
341                IntermediateProcessError::Cgroup(err.to_string())
342            })?;
343        }
344    }
345
346    Ok(())
347}
348
#[cfg(test)]
mod tests {
    use anyhow::Result;
    use libcgroups::test_manager::TestManager;
    use nix::unistd::Pid;
    use oci_spec::runtime::LinuxResources;
    use procfs::process::Process;

    use super::*;

    /// Pid of the test process as reported by procfs.
    fn own_pid() -> Result<Pid> {
        Ok(Pid::from_raw(Process::myself()?.pid()))
    }

    /// An init container adds itself to the cgroup and applies the resources.
    #[test]
    fn apply_cgroup_init() -> Result<()> {
        let manager = TestManager::default();
        let limits = LinuxResources::default();

        apply_cgroups(&manager, Some(&limits), ContainerType::InitContainer)?;

        let added = manager.get_add_task_args();
        assert_eq!(added.len(), 1);
        assert_eq!(added[0], own_pid()?);
        assert!(manager.apply_called());
        Ok(())
    }

    /// A tenant container adds itself to the cgroup but never applies resources.
    #[test]
    fn apply_cgroup_tenant() -> Result<()> {
        let manager = TestManager::default();
        let limits = LinuxResources::default();
        let tenant = ContainerType::TenantContainer {
            exec_notify_fd: 0,
            landlord_init_pid: None,
        };

        apply_cgroups(&manager, Some(&limits), tenant)?;

        assert_eq!(manager.get_add_task_args()[0], own_pid()?);
        assert!(!manager.apply_called());
        Ok(())
    }

    /// Without resources there is nothing to apply, even for an init container.
    #[test]
    fn apply_cgroup_no_resources() -> Result<()> {
        let manager = TestManager::default();

        apply_cgroups(&manager, None, ContainerType::InitContainer)?;

        assert_eq!(manager.get_add_task_args()[0], own_pid()?);
        assert!(!manager.apply_called());
        Ok(())
    }
}