1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
use std::fs;
use std::io::Write;
use std::os::fd::{AsRawFd, OwnedFd};
use std::path::PathBuf;
use std::rc::Rc;
use libcgroups::common::CgroupManager;
use nix::unistd::Pid;
use oci_spec::runtime::Spec;
use super::{Container, ContainerStatus};
use crate::error::{CreateContainerError, LibcontainerError, MissingSpecError};
use crate::notify_socket::NotifyListener;
use crate::process::args::{ContainerArgs, ContainerType};
use crate::process::intel_rdt::delete_resctrl_subdirectory;
use crate::process::{self};
use crate::syscall::syscall::SyscallType;
use crate::user_ns::UserNamespaceConfig;
use crate::utils;
use crate::workload::Executor;
/// Bundles everything needed to create a container process.
///
/// `create` drives the whole procedure: it runs the container and, if that
/// fails for an init container, attempts to clean up what was left behind.
pub(super) struct ContainerBuilderImpl {
/// Flag indicating if an init or a tenant container should be created
pub container_type: ContainerType,
/// Interface to operating system primitives
pub syscall: SyscallType,
/// Flag indicating if systemd should be used for cgroup management.
/// Note that the cgroup configuration also selects systemd whenever
/// `user_ns_config` is set, regardless of this flag.
pub use_systemd: bool,
/// Id of the container
pub container_id: String,
/// OCI compliant runtime spec
pub spec: Rc<Spec>,
/// Root filesystem of the container
pub rootfs: PathBuf,
/// File which will be used to communicate the pid of the
/// container process to the higher level runtime
pub pid_file: Option<PathBuf>,
/// Socket to communicate the file descriptor of the pty
pub console_socket: Option<OwnedFd>,
/// Options for new user namespace
pub user_ns_config: Option<UserNamespaceConfig>,
/// Path to the Unix Domain Socket to communicate container start
pub notify_path: PathBuf,
/// Container state
pub container: Option<Container>,
/// File descriptors preserved/passed to the container init process.
pub preserve_fds: i32,
/// If the container is to be run in detached mode
pub detached: bool,
/// Executor for the container workload; the default one executes the
/// specified command as a generic command
pub executor: Box<dyn Executor>,
/// If set, pivot root is not used to jail the process inside the rootfs
pub no_pivot: bool,
/// File descriptor set to stdin of the container init process.
pub stdin: Option<OwnedFd>,
/// File descriptor set to stdout of the container init process.
pub stdout: Option<OwnedFd>,
/// File descriptor set to stderr of the container init process.
pub stderr: Option<OwnedFd>,
/// Indicate if the init process should be a sibling of the main process.
pub as_sibling: bool,
}
impl ContainerBuilderImpl {
/// Creates the container and returns the pid produced by `run_container`.
///
/// # Errors
///
/// A failure of `run_container` is wrapped in a `CreateContainerError`. For
/// an init container a cleanup is attempted before returning, and a failure
/// of that cleanup is attached to the same error.
pub(super) fn create(&mut self) -> Result<Pid, LibcontainerError> {
    let run_err = match self.run_container() {
        Ok(pid) => return Ok(pid),
        Err(err) => err,
    };

    // Only the init container should be cleaned up in the case of an error;
    // for any other container type no cleanup is attempted at all.
    let cleanup_err = self
        .is_init_container()
        .then(|| self.cleanup_container().err())
        .flatten();

    Err(CreateContainerError::new(run_err, cleanup_err).into())
}
/// Returns `true` when `container_type` is `ContainerType::InitContainer`,
/// and `false` for every other container type.
fn is_init_container(&self) -> bool {
matches!(self.container_type, ContainerType::InitContainer)
}
fn run_container(&mut self) -> Result<Pid, LibcontainerError> {
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id);
let cgroup_config = libcgroups::common::CgroupConfig {
cgroup_path: cgroups_path,
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
container_name: self.container_id.to_owned(),
};
let process = self
.spec
.process()
.as_ref()
.ok_or(MissingSpecError::Process)?;
// Need to create the notify socket before we pivot root, since the unix
// domain socket used here is outside of the rootfs of container. During
// exec, need to create the socket before we enter into existing mount
// namespace. We also need to create to socket before entering into the
// user namespace in the case that the path is located in paths only
// root can access.
let notify_listener = NotifyListener::new(&self.notify_path)?;
// If Out-of-memory score adjustment is set in specification. set the score
// value for the current process check
// https://dev.to/rrampage/surviving-the-linux-oom-killer-2ki9 for some more
// information.
//
// This has to be done before !dumpable because /proc/self/oom_score_adj
// is not writeable unless you're an privileged user (if !dumpable is
// set). All children inherit their parent's oom_score_adj value on
// fork(2) so this will always be propagated properly.
if let Some(oom_score_adj) = process.oom_score_adj() {
tracing::debug!("Set OOM score to {}", oom_score_adj);
let mut f = fs::File::create("/proc/self/oom_score_adj").map_err(|err| {
tracing::error!("failed to open /proc/self/oom_score_adj: {}", err);
LibcontainerError::OtherIO(err)
})?;
f.write_all(oom_score_adj.to_string().as_bytes())
.map_err(|err| {
tracing::error!("failed to write to /proc/self/oom_score_adj: {}", err);
LibcontainerError::OtherIO(err)
})?;
}
// Make the process non-dumpable, to avoid various race conditions that
// could cause processes in namespaces we're joining to access host
// resources (or potentially execute code).
//
// However, if the number of namespaces we are joining is 0, we are not
// going to be switching to a different security context. Thus setting
// ourselves to be non-dumpable only breaks things (like rootless
// containers), which is the recommendation from the kernel folks.
if linux.namespaces().is_some() {
prctl::set_dumpable(false).map_err(|e| {
LibcontainerError::Other(format!(
"error in setting dumpable to false : {}",
nix::errno::Errno::from_raw(e)
))
})?;
}
// This container_args will be passed to the container processes,
// therefore we will have to move all the variable by value. Since self
// is a shared reference, we have to clone these variables here.
let container_args = ContainerArgs {
container_type: self.container_type,
syscall: self.syscall,
spec: Rc::clone(&self.spec),
rootfs: self.rootfs.to_owned(),
console_socket: self.console_socket.as_ref().map(|c| c.as_raw_fd()),
notify_listener,
preserve_fds: self.preserve_fds,
container: self.container.to_owned(),
user_ns_config: self.user_ns_config.to_owned(),
cgroup_config,
detached: self.detached,
executor: self.executor.clone(),
no_pivot: self.no_pivot,
stdin: self.stdin.as_ref().map(|x| x.as_raw_fd()),
stdout: self.stdout.as_ref().map(|x| x.as_raw_fd()),
stderr: self.stderr.as_ref().map(|x| x.as_raw_fd()),
as_sibling: self.as_sibling,
pid_file: self.pid_file.to_owned(),
};
let (init_pid, need_to_clean_up_intel_rdt_dir) =
process::container_main_process::container_main_process(&container_args).map_err(
|err| {
tracing::error!("failed to run container process {}", err);
LibcontainerError::MainProcess(err)
},
)?;
if let Some(container) = &mut self.container {
// update status and pid of the container process
container
.set_status(ContainerStatus::Created)
.set_creator(nix::unistd::geteuid().as_raw())
.set_pid(init_pid.as_raw())
.set_clean_up_intel_rdt_directory(need_to_clean_up_intel_rdt_dir)
.save()?;
}
Ok(init_pid)
}
fn cleanup_container(&self) -> Result<(), LibcontainerError> {
let linux = self.spec.linux().as_ref().ok_or(MissingSpecError::Linux)?;
let cgroups_path = utils::get_cgroup_path(linux.cgroups_path(), &self.container_id);
let cmanager =
libcgroups::common::create_cgroup_manager(libcgroups::common::CgroupConfig {
cgroup_path: cgroups_path,
systemd_cgroup: self.use_systemd || self.user_ns_config.is_some(),
container_name: self.container_id.to_string(),
})?;
let mut errors = Vec::new();
if let Err(e) = cmanager.remove() {
tracing::error!(error = ?e, "failed to remove cgroup manager");
errors.push(e.to_string());
}
if let Some(container) = &self.container {
if let Some(true) = container.clean_up_intel_rdt_subdirectory() {
if let Err(e) = delete_resctrl_subdirectory(container.id()) {
tracing::error!(id = ?container.id(), error = ?e, "failed to delete resctrl subdirectory");
errors.push(e.to_string());
}
}
if container.root.exists() {
if let Err(e) = fs::remove_dir_all(&container.root) {
tracing::error!(container_root = ?container.root, error = ?e, "failed to delete container root");
errors.push(e.to_string());
}
}
}
if !errors.is_empty() {
return Err(LibcontainerError::Other(format!(
"failed to cleanup container: {}",
errors.join(";")
)));
}
Ok(())
}
}