1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
use super::{Container, ContainerStatus};
use crate::{
hooks,
notify_socket::NotifyListener,
process::{
self,
args::{ContainerArgs, ContainerType},
},
rootless::Rootless,
syscall::Syscall,
utils,
};
use anyhow::{bail, Context, Result};
use nix::unistd::Pid;
use oci_spec::runtime::Spec;
use std::{fs, io::Write, os::unix::prelude::RawFd, path::PathBuf};
/// Bundles everything `create` needs to start a container and, if that
/// fails, to clean up the cgroup and the container's root directory again.
pub(super) struct ContainerBuilderImpl<'a> {
/// Flag indicating if an init or a tenant container should be created
pub container_type: ContainerType,
/// Interface to operating system primitives
pub syscall: &'a dyn Syscall,
/// Flag indicating if systemd should be used for cgroup management
/// (rootless containers use the systemd path regardless of this flag)
pub use_systemd: bool,
/// Id of the container
pub container_id: String,
/// OCI compliant runtime spec
pub spec: &'a Spec,
/// Root filesystem of the container
pub rootfs: PathBuf,
/// File which will be used to communicate the pid of the
/// container process to the higher level runtime
pub pid_file: Option<PathBuf>,
/// Socket to communicate the file descriptor of the pty
pub console_socket: Option<RawFd>,
/// Options for rootless containers; `Some` marks the container as rootless
pub rootless: Option<Rootless<'a>>,
/// Path to the Unix Domain Socket to communicate container start
pub notify_path: PathBuf,
/// Container state; when present it is updated and saved once the
/// container processes have been started
pub container: Option<Container>,
/// File descriptors preserved/passed to the container init process.
pub preserve_fds: i32,
/// If the container is to be run in detached mode
pub detached: bool,
}
impl<'a> ContainerBuilderImpl<'a> {
pub(super) fn create(&mut self) -> Result<Pid> {
match self.run_container().context("failed to create container") {
Ok(pid) => Ok(pid),
Err(outer) => {
if let Err(inner) = self.cleanup_container() {
return Err(outer.context(inner));
}
Err(outer)
}
}
}
fn run_container(&mut self) -> Result<Pid> {
let linux = self.spec.linux().as_ref().context("no linux in spec")?;
let cgroups_path = utils::get_cgroup_path(
linux.cgroups_path(),
&self.container_id,
self.rootless.is_some(),
);
let cmanager = libcgroups::common::create_cgroup_manager(
&cgroups_path,
self.use_systemd || self.rootless.is_some(),
&self.container_id,
)?;
let process = self.spec.process().as_ref().context("No process in spec")?;
if matches!(self.container_type, ContainerType::InitContainer) {
if let Some(hooks) = self.spec.hooks() {
hooks::run_hooks(hooks.create_runtime().as_ref(), self.container.as_ref())?
}
}
// Need to create the notify socket before we pivot root, since the unix
// domain socket used here is outside of the rootfs of container. During
// exec, need to create the socket before we enter into existing mount
// namespace.
let notify_socket: NotifyListener = NotifyListener::new(&self.notify_path)?;
// If Out-of-memory score adjustment is set in specification. set the score
// value for the current process check
// https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more
// information.
//
// This has to be done before !dumpable because /proc/self/oom_score_adj
// is not writeable unless you're an privileged user (if !dumpable is
// set). All children inherit their parent's oom_score_adj value on
// fork(2) so this will always be propagated properly.
if let Some(oom_score_adj) = process.oom_score_adj() {
log::debug!("Set OOM score to {}", oom_score_adj);
let mut f = fs::File::create("/proc/self/oom_score_adj")?;
f.write_all(oom_score_adj.to_string().as_bytes())?;
}
// Make the process non-dumpable, to avoid various race conditions that
// could cause processes in namespaces we're joining to access host
// resources (or potentially execute code).
//
// However, if the number of namespaces we are joining is 0, we are not
// going to be switching to a different security context. Thus setting
// ourselves to be non-dumpable only breaks things (like rootless
// containers), which is the recommendation from the kernel folks.
if linux.namespaces().is_some() {
prctl::set_dumpable(false).unwrap();
}
// This container_args will be passed to the container processes,
// therefore we will have to move all the variable by value. Since self
// is a shared reference, we have to clone these variables here.
let container_args = ContainerArgs {
container_type: self.container_type,
syscall: self.syscall,
spec: self.spec,
rootfs: &self.rootfs,
console_socket: self.console_socket,
notify_socket,
preserve_fds: self.preserve_fds,
container: &self.container,
rootless: &self.rootless,
cgroup_manager: cmanager,
detached: self.detached,
};
let (intermediate, init_pid) =
process::container_main_process::container_main_process(&container_args)?;
// if file to write the pid to is specified, write pid of the child
if let Some(pid_file) = &self.pid_file {
fs::write(pid_file, format!("{}", init_pid)).context("failed to write pid file")?;
}
if let Some(container) = &mut self.container {
// update status and pid of the container process
container
.set_status(ContainerStatus::Created)
.set_creator(nix::unistd::geteuid().as_raw())
.set_pid(init_pid.as_raw())
.save()
.context("Failed to save container state")?;
}
Ok(intermediate)
}
fn cleanup_container(&self) -> Result<()> {
let linux = self.spec.linux().as_ref().context("no linux in spec")?;
let cgroups_path = utils::get_cgroup_path(
linux.cgroups_path(),
&self.container_id,
self.rootless.is_some(),
);
let cmanager = libcgroups::common::create_cgroup_manager(
&cgroups_path,
self.use_systemd || self.rootless.is_some(),
&self.container_id,
)?;
let mut errors = Vec::new();
if let Err(e) = cmanager.remove().context("failed to remove cgroup") {
errors.push(e.to_string());
}
if let Some(container) = &self.container {
if container.root.exists() {
if let Err(e) = fs::remove_dir_all(&container.root)
.with_context(|| format!("could not delete {:?}", container.root))
{
errors.push(e.to_string());
}
}
}
if !errors.is_empty() {
bail!("failed to cleanup container: {}", errors.join(";"));
}
Ok(())
}
}