1use std::os::fd::FromRawFd;
2use std::path::Path;
3
4use nix::unistd::{Gid, Pid, Uid, close, getpid, write};
5use oci_spec::runtime::{LinuxNamespace, LinuxNamespaceType, LinuxResources};
6use pathrs::flags::OpenFlags;
7use pathrs::procfs::{ProcfsBase, ProcfsHandle};
8use procfs::{FromRead, ProcessCGroups};
9
10use super::args::{ContainerArgs, ContainerType};
11use super::channel::{IntermediateReceiver, MainSender};
12use super::fork::CloneCb;
13use super::init::process as init_process;
14use crate::error::MissingSpecError;
15use crate::namespaces::Namespaces;
16use crate::process::{channel, cpu_affinity, fork};
17
18#[derive(Debug, thiserror::Error)]
19pub enum IntermediateProcessError {
20 #[error(transparent)]
21 Channel(#[from] channel::ChannelError),
22 #[error(transparent)]
23 Namespace(#[from] crate::namespaces::NamespaceError),
24 #[error(transparent)]
25 Syscall(#[from] crate::syscall::SyscallError),
26 #[error("failed to launch init process")]
27 InitProcess(#[source] fork::CloneError),
28 #[error("cgroup error: {0}")]
29 Cgroup(String),
30 #[error(transparent)]
31 Procfs(#[from] procfs::ProcError),
32 #[error(transparent)]
33 Pathrs(#[from] pathrs::error::Error),
34 #[error("exec notify failed")]
35 ExecNotify(#[source] nix::Error),
36 #[error(transparent)]
37 MissingSpec(#[from] crate::error::MissingSpecError),
38 #[error("CPU affinity error {0}")]
39 CpuAffinity(#[from] cpu_affinity::CPUAffinityError),
40 #[error("other error")]
41 Other(String),
42}
43
/// Module-local result alias whose error type is [`IntermediateProcessError`].
type Result<T> = std::result::Result<T, IntermediateProcessError>;
45
46pub fn container_intermediate_process(
47 args: &ContainerArgs,
48 intermediate_chan: &mut (channel::IntermediateSender, channel::IntermediateReceiver),
49 init_chan: &mut (channel::InitSender, channel::InitReceiver),
50 main_sender: &mut channel::MainSender,
51) -> Result<()> {
52 let (inter_sender, inter_receiver) = intermediate_chan;
53 let (init_sender, init_receiver) = init_chan;
54 let command = args.syscall.create_syscall();
55 let spec = &args.spec;
56 let linux = spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
57 let namespaces = Namespaces::try_from(linux.namespaces().as_ref())?;
58 let cgroup_manager = libcgroups::common::create_cgroup_manager(args.cgroup_config.to_owned())
59 .map_err(|e| IntermediateProcessError::Cgroup(e.to_string()))?;
60
61 let current_pid = Pid::this();
62 if matches!(args.container_type, ContainerType::TenantContainer { .. }) {
64 if let Some(exec_cpu_affinity) = spec
65 .process()
66 .as_ref()
67 .and_then(|p| p.exec_cpu_affinity().as_ref())
68 {
69 if let Some(initial) = exec_cpu_affinity.initial() {
70 cpu_affinity::set_cpuset_affinity_from_string(current_pid, initial)?;
71 }
72 }
73 }
74 let _ = cpu_affinity::log_cpu_affinity();
75
76 apply_cgroups(
87 &cgroup_manager,
88 linux.resources().as_ref(),
89 args.container_type,
90 )?;
91
92 if matches!(args.container_type, ContainerType::TenantContainer { .. }) {
94 if let Some(exec_cpu_affinity) = spec
95 .process()
96 .as_ref()
97 .and_then(|p| p.exec_cpu_affinity().as_ref())
98 {
99 if let Some(cpu_affinity_final) = exec_cpu_affinity.cpu_affinity_final() {
100 cpu_affinity::set_cpuset_affinity_from_string(current_pid, cpu_affinity_final)?;
101 }
102 }
103 }
104
105 if let Some(user_namespace) = namespaces.get(LinuxNamespaceType::User)? {
110 setup_userns(&namespaces, user_namespace, main_sender, inter_receiver)?;
111
112 command.set_id(Uid::from_raw(0), Gid::from_raw(0))?;
119 }
120
121 let proc = spec.process().as_ref().ok_or(MissingSpecError::Process)?;
123 if let Some(rlimits) = proc.rlimits() {
124 for rlimit in rlimits {
125 command.set_rlimit(rlimit).map_err(|err| {
126 tracing::error!(?err, ?rlimit, "failed to set rlimit");
127 err
128 })?;
129 }
130 }
131
132 if let Some(pid_namespace) = namespaces.get(LinuxNamespaceType::Pid)? {
134 namespaces.unshare_or_setns(pid_namespace)?;
135 }
136
137 let cb: CloneCb = {
138 Box::new(|| {
139 if let Err(ret) = prctl::set_name("youki:[2:INIT]") {
140 tracing::error!(?ret, "failed to set name for child process");
141 return ret;
142 }
143
144 if let Err(err) = init_sender.close() {
148 tracing::error!(?err, "failed to close receiver in init process");
149 return -1;
150 }
151 if let Err(err) = inter_sender.close() {
152 tracing::error!(?err, "failed to close sender in the intermediate process");
153 return -1;
154 }
155 match init_process::container_init_process(args, main_sender, init_receiver) {
156 Ok(_) => 0,
157 Err(e) => {
158 tracing::error!("failed to initialize container process: {e}");
159 if let Err(err) = main_sender.exec_failed(e.to_string()) {
160 tracing::error!(?err, "failed sending error to main sender");
161 }
162 if let ContainerType::TenantContainer { exec_notify_fd, .. } =
163 args.container_type
164 {
165 let buf = format!("{e}");
166 let exec_notify_fd =
167 unsafe { std::os::fd::OwnedFd::from_raw_fd(exec_notify_fd) };
168 if let Err(err) = write(&exec_notify_fd, buf.as_bytes()) {
169 tracing::error!(?err, "failed to write to exec notify fd");
170 }
171
172 drop(exec_notify_fd);
175 }
176 -1
177 }
178 }
179 })
180 };
181
182 let pid = fork::container_clone_sibling(cb).map_err(|err| {
191 tracing::error!("failed to fork init process: {}", err);
192 IntermediateProcessError::InitProcess(err)
193 })?;
194
195 if let ContainerType::TenantContainer { exec_notify_fd, .. } = args.container_type {
197 close(exec_notify_fd).map_err(|err| {
198 tracing::error!("failed to close exec notify fd: {}", err);
199 IntermediateProcessError::ExecNotify(err)
200 })?;
201 }
202
203 main_sender.intermediate_ready(pid).map_err(|err| {
204 tracing::error!("failed to wait on intermediate process: {}", err);
205 err
206 })?;
207
208 main_sender.close().map_err(|err| {
210 tracing::error!("failed to close unused main sender: {}", err);
211 err
212 })?;
213 inter_sender.close().map_err(|err| {
214 tracing::error!(
215 "failed to close sender in the intermediate process: {}",
216 err
217 );
218 err
219 })?;
220 init_sender.close().map_err(|err| {
221 tracing::error!("failed to close unused init sender: {}", err);
222 err
223 })?;
224
225 Ok(())
226}
227
228fn setup_userns(
229 namespaces: &Namespaces,
230 user_namespace: &LinuxNamespace,
231 sender: &mut MainSender,
232 receiver: &mut IntermediateReceiver,
233) -> Result<()> {
234 namespaces.unshare_or_setns(user_namespace)?;
235 if user_namespace.path().is_some() {
236 return Ok(());
237 }
238
239 tracing::debug!("creating new user namespace");
240 prctl::set_dumpable(true).map_err(|e| {
243 IntermediateProcessError::Other(format!(
244 "error in setting dumpable to true : {}",
245 nix::errno::Errno::from_raw(e)
246 ))
247 })?;
248 sender.identifier_mapping_request().map_err(|err| {
249 tracing::error!("failed to send id mapping request: {}", err);
250 err
251 })?;
252 receiver.wait_for_mapping_ack().map_err(|err| {
253 tracing::error!("failed to receive id mapping ack: {}", err);
254 err
255 })?;
256 prctl::set_dumpable(false).map_err(|e| {
257 IntermediateProcessError::Other(format!(
258 "error in setting dumplable to false : {}",
259 nix::errno::Errno::from_raw(e)
260 ))
261 })?;
262 Ok(())
263}
264
/// Reports whether `_err` is a systemd cgroup manager error that signals EBUSY.
///
/// With the `systemd` feature enabled, the error is downcast to
/// `libcgroups::common::AnyManagerError` and the result of `is_ebusy()` on its
/// `Systemd` variant is returned. Without the feature the answer is always
/// `false`.
fn is_ebusy<E: std::error::Error + Send + Sync + 'static>(_err: &E) -> bool {
    #[cfg(feature = "systemd")]
    {
        use libcgroups::common::AnyManagerError;

        // Erase the concrete type so the error can be downcast.
        let erased: &(dyn std::error::Error + 'static) = _err;
        match erased.downcast_ref::<AnyManagerError>() {
            Some(AnyManagerError::Systemd(inner)) => inner.is_ebusy(),
            _ => false,
        }
    }

    #[cfg(not(feature = "systemd"))]
    {
        false
    }
}
279
280fn apply_cgroups<
281 C: libcgroups::common::CgroupManager<Error = E> + ?Sized,
282 E: std::error::Error + Send + Sync + 'static,
283>(
284 cmanager: &C,
285 resources: Option<&LinuxResources>,
286 container_type: ContainerType,
287) -> Result<()> {
288 let init = matches!(container_type, ContainerType::InitContainer);
289
290 let pid = getpid();
291 if let Err(err) = cmanager.add_task(pid) {
292 if !init && is_ebusy(&err) {
293 tracing::debug!(
296 "failed to add task to cgroup, trying to join parent's init process cgroup"
297 );
298
299 if let ContainerType::TenantContainer {
300 exec_notify_fd: _,
301 landlord_init_pid,
302 } = container_type
303 && let Some(landlord_init_pid) = landlord_init_pid
304 && let Some(landlord_init_proc_cgroup) =
305 ProcessCGroups::from_read(ProcfsHandle::new()?.open(
306 ProcfsBase::ProcPid(landlord_init_pid.as_raw() as u32),
307 "cgroup",
308 OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC,
309 )?)?
310 .into_iter()
311 .find(|c| c.controllers.is_empty())
312 && let Some(landlord_init_proc_cgroup_path) =
313 landlord_init_proc_cgroup.pathname.strip_prefix("/")
314 {
315 libcgroups::common::write_cgroup_file(
316 Path::new(libcgroups::common::DEFAULT_CGROUP_ROOT)
317 .join(Path::new(landlord_init_proc_cgroup_path))
318 .join(libcgroups::common::CGROUP_PROCS),
319 pid,
320 )
321 .map_err(|err| IntermediateProcessError::Cgroup(err.to_string()))?;
322 return Ok(());
323 }
324 }
325
326 tracing::error!(?pid, ?err, ?init, "failed to add task to cgroup");
327 return Err(IntermediateProcessError::Cgroup(err.to_string()));
328 }
329
330 if let Some(resources) = resources {
331 if init {
332 let controller_opt = libcgroups::common::ControllerOpt {
333 resources,
334 freezer_state: None,
335 oom_score_adj: None,
336 disable_oom_killer: false,
337 };
338
339 cmanager.apply(&controller_opt).map_err(|err| {
340 tracing::error!(?pid, ?err, ?init, "failed to apply cgroup");
341 IntermediateProcessError::Cgroup(err.to_string())
342 })?;
343 }
344 }
345
346 Ok(())
347}
348
#[cfg(test)]
mod tests {
    use anyhow::Result;
    use libcgroups::test_manager::TestManager;
    use nix::unistd::Pid;
    use oci_spec::runtime::LinuxResources;
    use procfs::process::Process;

    use super::*;

    /// Asserts that `add_task` was called exactly once, with the pid of the
    /// process running the test.
    fn assert_current_pid_added_once(cmanager: &TestManager) -> Result<()> {
        let added = cmanager.get_add_task_args();
        assert_eq!(added.len(), 1);
        assert_eq!(added[0], Pid::from_raw(Process::myself()?.pid()));
        Ok(())
    }

    /// An init container joins the cgroup and applies the resources.
    #[test]
    fn apply_cgroup_init() -> Result<()> {
        let cmanager = TestManager::default();
        let resources = LinuxResources::default();

        apply_cgroups(&cmanager, Some(&resources), ContainerType::InitContainer)?;

        assert_current_pid_added_once(&cmanager)?;
        assert!(cmanager.apply_called());
        Ok(())
    }

    /// A tenant container joins the cgroup but never applies resources.
    #[test]
    fn apply_cgroup_tenant() -> Result<()> {
        let cmanager = TestManager::default();
        let resources = LinuxResources::default();

        apply_cgroups(
            &cmanager,
            Some(&resources),
            ContainerType::TenantContainer {
                exec_notify_fd: 0,
                landlord_init_pid: None,
            },
        )?;

        assert_current_pid_added_once(&cmanager)?;
        assert!(!cmanager.apply_called());
        Ok(())
    }

    /// Without resources, an init container joins the cgroup and applies nothing.
    #[test]
    fn apply_cgroup_no_resources() -> Result<()> {
        let cmanager = TestManager::default();

        apply_cgroups(&cmanager, None, ContainerType::InitContainer)?;

        assert_current_pid_added_once(&cmanager)?;
        assert!(!cmanager.apply_called());
        Ok(())
    }
}