1use std::collections::HashMap;
2use std::fs;
3use std::fs::File;
4use std::os::fd::AsRawFd;
5use std::path::PathBuf;
6
7use nix::sys::wait::{WaitStatus, waitpid};
8use nix::unistd::Pid;
9use oci_spec::runtime::{Linux, LinuxNamespaceType};
10#[cfg(feature = "libseccomp")]
11use oci_spec::runtime::{SECCOMP_FD_NAME, VERSION as OCI_VERSION};
12
13use crate::hooks;
14use crate::network::network_device::dev_change_net_namespace;
15use crate::process::args::{ContainerArgs, ContainerType};
16use crate::process::fork::{self, CloneCb};
17use crate::process::intel_rdt::setup_intel_rdt;
18use crate::process::{channel, container_intermediate_process};
19use crate::syscall::SyscallError;
20use crate::user_ns::UserNamespaceConfig;
21
22#[derive(Debug, thiserror::Error)]
23pub enum ProcessError {
24 #[error(transparent)]
25 Channel(#[from] channel::ChannelError),
26 #[error("failed to write deny to setgroups")]
27 SetGroupsDeny(#[source] std::io::Error),
28 #[error(transparent)]
29 UserNamespace(#[from] crate::user_ns::UserNamespaceError),
30 #[error("container state is required")]
31 ContainerStateRequired,
32 #[error("failed to wait for intermediate process")]
33 WaitIntermediateProcess(#[source] nix::Error),
34 #[error(transparent)]
35 IntelRdt(#[from] crate::process::intel_rdt::IntelRdtError),
36 #[error("failed to create intermediate process")]
37 IntermediateProcessFailed(#[source] fork::CloneError),
38 #[error("failed seccomp listener")]
39 #[cfg(feature = "libseccomp")]
40 SeccompListener(#[from] crate::process::seccomp_listener::SeccompListenerError),
41 #[error("failed setup network device")]
42 Network(#[from] crate::network::NetworkError),
43 #[error("failed syscall")]
44 SyscallOther(#[source] SyscallError),
45 #[error("failed hooks {0}")]
46 Hooks(#[from] crate::hooks::HookError),
47 #[error("failed to build OCI state: {0}")]
48 OciStateBuild(String),
49}
50
51type Result<T> = std::result::Result<T, ProcessError>;
52
53pub fn container_main_process(container_args: &ContainerArgs) -> Result<(Pid, bool)> {
54 let (mut main_sender, mut main_receiver) = channel::main_channel()?;
60 let mut inter_chan = channel::intermediate_channel()?;
61 let mut init_chan = channel::init_channel()?;
62
63 let cb: CloneCb = {
64 Box::new(|| {
65 if let Err(ret) = prctl::set_name("youki:[1:INTER]") {
66 tracing::error!(?ret, "failed to set name for child process");
67 return ret;
68 }
69
70 match container_intermediate_process::container_intermediate_process(
71 container_args,
72 &mut inter_chan,
73 &mut init_chan,
74 &mut main_sender,
75 ) {
76 Ok(_) => 0,
77 Err(err) => {
78 tracing::error!("failed to run intermediate process {}", err);
79 match main_sender.send_error(err.to_string()) {
80 Ok(_) => {}
81 Err(e) => {
82 tracing::error!(
83 "error in sending intermediate error message {} to main: {}",
84 err,
85 e
86 )
87 }
88 }
89 -1
90 }
91 }
92 })
93 };
94
95 let container_clone_fn = if container_args.as_sibling {
96 fork::container_clone_sibling
97 } else {
98 fork::container_clone
99 };
100
101 let intermediate_pid = container_clone_fn(cb).map_err(|err| {
102 tracing::error!("failed to fork intermediate process: {}", err);
103 ProcessError::IntermediateProcessFailed(err)
104 })?;
105
106 main_sender.close().map_err(|err| {
109 tracing::error!("failed to close unused sender: {}", err);
110 err
111 })?;
112
113 let (mut inter_sender, inter_receiver) = inter_chan;
114 let (mut init_sender, init_receiver) = init_chan;
115
116 if let Some(config) = &container_args.user_ns_config {
120 main_receiver.wait_for_mapping_request()?;
121 setup_mapping(config, intermediate_pid)?;
122 inter_sender.mapping_written()?;
123 }
124
125 inter_sender.close().map_err(|err| {
128 tracing::error!("failed to close unused intermediate sender: {}", err);
129 err
130 })?;
131
132 let init_pid = main_receiver.wait_for_intermediate_ready()?;
135 let mut need_to_clean_up_intel_rdt_subdirectory = false;
136
137 if let Some(linux) = container_args.spec.linux() {
138 if let Some(intel_rdt) = linux.intel_rdt() {
139 let container_id = container_args
140 .container
141 .as_ref()
142 .map(|container| container.id());
143 need_to_clean_up_intel_rdt_subdirectory =
144 setup_intel_rdt(container_id, &init_pid, intel_rdt)?;
145 }
146 }
147
148 if let Some(pid_file) = &container_args.pid_file {
150 if let Err(err) = fs::write(pid_file, format!("{init_pid}")) {
151 tracing::warn!("failed to write pid to file: {err}");
152 }
153 }
154
155 if matches!(container_args.container_type, ContainerType::InitContainer) {
156 if let Some(hooks) = container_args.spec.hooks() {
157 main_receiver.wait_for_hook_request()?;
158 if let Some(container_for_hooks) = &container_args.container {
159 hooks::run_hooks(
160 hooks.prestart().as_ref(),
161 Some(&container_for_hooks.state),
162 None,
163 Some(init_pid),
164 )
165 .map_err(|err| {
166 tracing::error!("failed to run prestart hooks: {}", err);
167 err
168 })?;
169
170 hooks::run_hooks(
171 hooks.create_runtime().as_ref(),
172 Some(&container_for_hooks.state),
173 None,
174 Some(init_pid),
175 )
176 .map_err(|err| {
177 tracing::error!("failed to run create runtime hooks: {}", err);
178 err
179 })?;
180 }
181 init_sender.hook_done()?;
182 }
183 }
184
185 if let Some(linux) = container_args.spec.linux() {
186 move_network_devices_to_container(linux, init_pid, &mut main_receiver, &mut init_sender)?;
187
188 #[cfg(feature = "libseccomp")]
189 if let Some(seccomp) = linux.seccomp() {
190 let container = container_args
191 .container
192 .as_ref()
193 .ok_or(ProcessError::ContainerStateRequired)?;
194
195 let oci_status = match container_args.container_type {
197 ContainerType::InitContainer => oci_spec::runtime::ContainerState::Creating,
198 ContainerType::TenantContainer { .. } => oci_spec::runtime::ContainerState::Running,
199 };
200
201 let oci_state = oci_spec::runtime::StateBuilder::default()
203 .version(OCI_VERSION)
204 .id(container.state.id.clone())
205 .status(oci_status)
206 .pid(init_pid.as_raw())
207 .bundle(container.state.bundle.clone())
208 .annotations(container.state.annotations.clone().unwrap_or_default())
209 .build()
210 .map_err(|e| ProcessError::OciStateBuild(e.to_string()))?;
211
212 let state = oci_spec::runtime::ContainerProcessStateBuilder::default()
213 .version(OCI_VERSION)
214 .fds(vec![SECCOMP_FD_NAME.to_string()])
215 .pid(init_pid.as_raw())
216 .metadata(seccomp.listener_metadata().clone().unwrap_or_default())
217 .state(oci_state)
218 .build()
219 .map_err(|e| ProcessError::OciStateBuild(e.to_string()))?;
220 crate::process::seccomp_listener::sync_seccomp(
221 seccomp,
222 &state,
223 &mut init_sender,
224 &mut main_receiver,
225 )?;
226 }
227 }
228
229 init_sender.close().map_err(|err| {
232 tracing::error!("failed to close unused init sender: {}", err);
233 err
234 })?;
235
236 main_receiver.wait_for_init_ready().map_err(|err| {
237 tracing::error!("failed to wait for init ready: {}", err);
238 err
239 })?;
240
241 tracing::debug!("init pid is {:?}", init_pid);
242
243 inter_receiver.close().map_err(|err| {
246 tracing::error!("failed to close intermediate process receiver: {}", err);
247 err
248 })?;
249
250 init_receiver.close().map_err(|err| {
251 tracing::error!("failed to close init process receiver: {}", err);
252 err
253 })?;
254
255 main_receiver.close().map_err(|err| {
256 tracing::error!("failed to close main process receiver: {}", err);
257 err
258 })?;
259
260 match waitpid(intermediate_pid, None) {
265 Ok(WaitStatus::Exited(_, 0)) => (),
266 Ok(WaitStatus::Exited(_, s)) => {
267 tracing::warn!("intermediate process failed with exit status: {s}");
268 }
269 Ok(WaitStatus::Signaled(_, sig, _)) => {
270 tracing::warn!("intermediate process killed with signal: {sig}")
271 }
272 Ok(_) => (),
273 Err(nix::errno::Errno::ECHILD) => {
274 tracing::warn!("intermediate process already reaped");
277 }
278 Err(err) => return Err(ProcessError::WaitIntermediateProcess(err)),
279 };
280
281 Ok((init_pid, need_to_clean_up_intel_rdt_subdirectory))
282}
283
284fn setup_mapping(config: &UserNamespaceConfig, pid: Pid) -> Result<()> {
285 tracing::debug!("write mapping for pid {:?}", pid);
286 if !config.privileged && config.newuidmap.is_none() && config.newgidmap.is_none() {
293 std::fs::write(format!("/proc/{pid}/setgroups"), "deny")
294 .map_err(ProcessError::SetGroupsDeny)?;
295 }
296
297 config.write_uid_mapping(pid).map_err(|err| {
298 tracing::error!("failed to write uid mapping for pid {:?}: {}", pid, err);
299 err
300 })?;
301 config.write_gid_mapping(pid).map_err(|err| {
302 tracing::error!("failed to write gid mapping for pid {:?}: {}", pid, err);
303 err
304 })?;
305 Ok(())
306}
307
308fn move_network_devices_to_container(
313 linux: &Linux,
314 init_pid: Pid,
315 main_receiver: &mut channel::MainReceiver,
316 init_sender: &mut channel::InitSender,
317) -> Result<()> {
318 let devices = match linux.net_devices() {
320 Some(devs) if !devs.is_empty() => devs,
321 _ => return Ok(()),
322 };
323
324 if let Some(namespaces) = linux.namespaces() {
325 let net_ns = match namespaces
327 .iter()
328 .find(|ns| ns.typ() == LinuxNamespaceType::Network)
329 {
330 Some(ns) => ns,
331 None => return Ok(()),
332 };
333
334 main_receiver.wait_for_network_setup_ready()?;
337
338 let default_ns_path = PathBuf::from(format!("/proc/{}/ns/net", init_pid.as_raw()));
341 let ns_path = net_ns.path().as_deref().unwrap_or(&default_ns_path);
342
343 let netns_file = File::open(ns_path).map_err(|err| {
345 tracing::error!(
346 "failed to open network namespace at {}: {}",
347 ns_path.display(),
348 err
349 );
350 ProcessError::Network(err.into())
351 })?;
352 let netns_fd = netns_file.as_raw_fd();
353
354 let addrs_map = devices
359 .iter()
360 .map(|(name, net_dev)| {
361 let addrs = dev_change_net_namespace(name, netns_fd, net_dev).map_err(|err| {
362 tracing::error!("failed to dev_change_net_namespace: {}", err);
363 err
364 })?;
365 Ok((name.clone(), addrs))
366 })
367 .collect::<Result<HashMap<String, Vec<crate::network::cidr::CidrAddress>>>>()?;
368 init_sender.move_network_device(addrs_map)?;
369 }
370
371 Ok(())
372}
373
#[cfg(test)]
mod tests {
    use std::fs;

    use anyhow::Result;
    use nix::sched::{CloneFlags, unshare};
    use nix::unistd::{self, getgid, getuid};
    use oci_spec::runtime::LinuxIdMappingBuilder;
    use serial_test::serial;

    use super::*;
    use crate::process::channel::{intermediate_channel, main_channel};
    use crate::user_ns::UserNamespaceIDMapper;

    /// Forks a child that enters a new user namespace and requests a
    /// mapping; the parent writes the uid mapping through `setup_mapping`
    /// and compares the tokens of the mapping file with the configured
    /// container id, host id and size.
    #[test]
    #[serial]
    fn setup_uid_mapping_should_succeed() -> Result<()> {
        let uid_mapping = LinuxIdMappingBuilder::default()
            .host_id(getuid())
            .container_id(0u32)
            .size(1u32)
            .build()?;
        let tmp = tempfile::tempdir()?;
        let id_mapper = UserNamespaceIDMapper::new_test(tmp.path().to_path_buf());
        let ns_config = UserNamespaceConfig {
            uid_mappings: Some(vec![uid_mapping]),
            privileged: true,
            id_mapper: id_mapper.clone(),
            ..Default::default()
        };
        let (mut parent_sender, mut parent_receiver) = main_channel()?;
        let (mut child_sender, mut child_receiver) = intermediate_channel()?;

        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                parent_receiver.wait_for_mapping_request()?;
                parent_receiver.close()?;

                // The test mapper writes below the temporary directory, so
                // the target path has to exist before the mapping is written.
                id_mapper.ensure_uid_path(&child)?;
                setup_mapping(&ns_config, child)?;

                let expected = [
                    uid_mapping.container_id().to_string(),
                    uid_mapping.host_id().to_string(),
                    uid_mapping.size().to_string(),
                ];
                let written = fs::read_to_string(id_mapper.get_uid_path(&child))?;
                for (actual, wanted) in written.split_whitespace().zip(expected) {
                    assert_eq!(actual, wanted);
                }

                child_sender.mapping_written()?;
                child_sender.close()?;
            }
            unistd::ForkResult::Child => {
                prctl::set_dumpable(true).unwrap();
                unshare(CloneFlags::CLONE_NEWUSER)?;
                parent_sender.identifier_mapping_request()?;
                parent_sender.close()?;
                child_receiver.wait_for_mapping_ack()?;
                child_receiver.close()?;
                std::process::exit(0);
            }
        }

        Ok(())
    }

    /// Same scenario as the uid test, but for the gid mapping of an
    /// unprivileged configuration; additionally checks that `deny` was
    /// written to the child's `/proc/<pid>/setgroups`.
    #[test]
    #[serial]
    fn setup_gid_mapping_should_succeed() -> Result<()> {
        let gid_mapping = LinuxIdMappingBuilder::default()
            .host_id(getgid())
            .container_id(0u32)
            .size(1u32)
            .build()?;
        let tmp = tempfile::tempdir()?;
        let id_mapper = UserNamespaceIDMapper::new_test(tmp.path().to_path_buf());
        let ns_config = UserNamespaceConfig {
            gid_mappings: Some(vec![gid_mapping]),
            id_mapper: id_mapper.clone(),
            ..Default::default()
        };
        let (mut parent_sender, mut parent_receiver) = main_channel()?;
        let (mut child_sender, mut child_receiver) = intermediate_channel()?;

        match unsafe { unistd::fork()? } {
            unistd::ForkResult::Parent { child } => {
                parent_receiver.wait_for_mapping_request()?;
                parent_receiver.close()?;

                // The test mapper writes below the temporary directory, so
                // the target path has to exist before the mapping is written.
                id_mapper.ensure_gid_path(&child)?;
                setup_mapping(&ns_config, child)?;

                let expected = [
                    gid_mapping.container_id().to_string(),
                    gid_mapping.host_id().to_string(),
                    gid_mapping.size().to_string(),
                ];
                let written = fs::read_to_string(id_mapper.get_gid_path(&child))?;
                for (actual, wanted) in written.split_whitespace().zip(expected) {
                    assert_eq!(actual, wanted);
                }

                let setgroups = fs::read_to_string(format!("/proc/{}/setgroups", child.as_raw()))?;
                assert_eq!(setgroups, "deny\n");

                child_sender.mapping_written()?;
                child_sender.close()?;
            }
            unistd::ForkResult::Child => {
                prctl::set_dumpable(true).unwrap();
                unshare(CloneFlags::CLONE_NEWUSER)?;
                parent_sender.identifier_mapping_request()?;
                parent_sender.close()?;
                child_receiver.wait_for_mapping_ack()?;
                child_receiver.close()?;
                std::process::exit(0);
            }
        }

        Ok(())
    }
}