1use super::utils::{ro_bind_subentries_keep_symlinks, CLONE_NEWTIME};
2use super::{Bind, Container, ContainerRunner, ContainerRunnerResponse, Options, SetEnv, Stage};
3use crate::bwrap::bwrap;
4use crate::cgroup::{cgroup_init, cgroup_postexec, cgroup_preexec};
5use crate::filesystem;
6use crate::slirp::slirp;
7use crate::socket_pair::{set_cloexec, socket_pair};
8use crate::{msg_and, msg_ret, ok_or, some_or, true_or};
9use bincode;
10use libc::{close, unshare};
11use std::borrow::Cow;
12use std::collections::HashMap;
13use std::env;
14use std::ffi::OsString;
15use std::fs;
16use std::io::{self, Read, Write};
17use std::os::unix::process::ExitStatusExt;
18use std::path::Path;
19use std::process::ExitStatus;
20
21fn run_slirp(_container: &Container, response: &ContainerRunnerResponse) -> bool {
22 let (mut slirp_stream, slirp_sock) = some_or!(
23 socket_pair(),
24 msg_and!("Cannot create socket pair"; return false)
25 );
26
27 let mut args = Vec::<OsString>::new();
28 args.push("--configure".into());
29 args.push("--ready-fd".into());
30 args.push(slirp_sock.to_string().into());
31 args.push("--enable-ipv6".into());
32 args.push("--disable-host-loopback".into());
33 args.push(response.pid.to_string().into());
34 args.push("tap0".into());
35
36 let result = slirp(args).map(|_| ());
37 true_or!(unsafe { set_cloexec(slirp_sock) }, return false);
38 unsafe { close(slirp_sock) };
39
40 ok_or!(
41 result,
42 msg_and!("Failed to run slirp4netns: {}", result.unwrap_err(); return false)
43 );
44 ok_or!(
45 slirp_stream.read_exact(&mut [0u8]),
46 msg_and!("slirp init failed"; return false)
47 );
48 true
49}
50
51fn process_env(container: &Container, env: &[(OsString, OsString)]) -> Vec<(OsString, OsString)> {
52 let mut env_map = HashMap::new();
53 env_map.insert(
54 "PATH".into(),
55 "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".into(),
56 );
57
58 if container.keep_env {
59 env_map = HashMap::with_capacity(env.len());
60 for (k, v) in env {
61 env_map.insert(k.to_owned(), v.to_owned());
62 }
63 }
64 for option in &container.options {
65 match option {
66 Options::SetEnv(SetEnv { key, value }) => {
67 env_map.insert(key.to_owned(), value.to_owned());
68 }
69 Options::UnsetEnv(key) => {
70 env_map.remove(key);
71 }
72 _ => (),
73 };
74 }
75 env_map.into_iter().collect()
76}
77
78fn cleanup_container(container: &mut Container) {
79 container.keep_env = false;
80 container.base_image = None;
81 container.options.retain_mut(|option| match option {
82 Options::SetEnv(_) | Options::UnsetEnv(_) => false,
83 Options::Bind(Bind { src, dest: _ }) => {
84 *src = "".into();
85 true
86 }
87 Options::DevBind(Bind { src, dest: _ }) => {
88 *src = "".into();
89 true
90 }
91 Options::RoBind(Bind { src, dest: _ }) => {
92 *src = "".into();
93 true
94 }
95 Options::Symlink(_) => true,
96 Options::Dir(_) => true,
97 });
98}
99
100fn run_nft(rules: &Path) -> bool {
101 let mut args = Vec::<OsString>::new();
102 args.push("--unshare-ipc".into());
103 args.push("--unshare-pid".into());
104 args.push("--unshare-uts".into());
105 args.push("--unshare-cgroup".into());
106 args.push("--uid".into());
107 args.push("0".into());
108 args.push("--gid".into());
109 args.push("0".into());
110 args.push("--hostname".into());
111 args.push("".into());
112 args.push("--chdir".into());
113 args.push("/".into());
114 args.append(&mut ok_or!(
115 ro_bind_subentries_keep_symlinks("/container_staging_image", "/"),
116 msg_and!("Failed binding staging image"; return false)
117 ));
118 args.push("--ro-bind".into());
119 args.push(rules.to_owned().into());
120 args.push("/container_net_nft_rules".into());
121 args.push("--die-with-parent".into());
122 args.push("--cap-drop".into());
123 args.push("all".into());
124 args.push("--cap-add".into());
125 args.push("cap_net_admin".into());
126 args.push("--".into());
127
128 args.push("/usr/sbin/nft".into());
129 args.push("-f".into());
130 args.push("/container_net_nft_rules".into());
131
132 let exit_status = ok_or!(ok_or!(bwrap(args, true), return false).wait(), return false);
133 true_or!(exit_status.success(), return false);
134 true
135}
136
137pub fn ro_bind_filesystem<D>(dest: D) -> io::Result<Vec<OsString>>
138where
139 D: AsRef<Path>,
140{
141 let dest: &Path = dest.as_ref();
142
143 let mut result = Vec::new();
144 filesystem::iterate(false, |file_name, symlink| match symlink {
145 None => {
146 result.push("--ro-bind".into());
147 result.push(Path::new("/").join(file_name).into());
148 result.push(dest.join(file_name).into());
149 }
150 Some(symlink) => {
151 result.push("--symlink".into());
152 result.push(symlink.into());
153 result.push(dest.join(file_name).into());
154 }
155 })?;
156 Ok(result)
157}
158
159pub fn run_container(
160 stage: u8,
161 container: &Container,
162 env: &[(OsString, OsString)],
163 wait: bool,
164) -> Option<ExitStatus> {
165 true_or!(cgroup_init(stage == 0), return None);
166
167 let env: Cow<_> = if stage == 0 {
168 Cow::Owned(process_env(container, env))
169 } else {
170 Cow::Borrowed(env)
171 };
172
173 if !container.share_time && stage > 0 {
174 true_or!(unsafe { unshare(CLONE_NEWTIME) } == 0, return None);
175 }
176 if stage == 4 {
177 ok_or!(
179 fs::write("/container_net_nft_rules", &container.net_nft_rules),
180 return None
181 );
182 let result = run_nft(Path::new("/container_net_nft_rules"));
183 ok_or!(fs::remove_file("/container_net_nft_rules"), return None);
184 true_or!(result, return None);
185 }
186
187 let mut args = Vec::<OsString>::new();
188 args.push("--unshare-user".into());
189 args.push("--unshare-ipc".into());
190 if stage != 1 && stage != 3 && stage != 5 {
191 args.push("--unshare-pid".into());
192 }
193 if !container.share_net || (stage != 0 && stage != 2 && stage != 4 && stage != 6) {
194 args.push("--unshare-net".into());
195 }
196 args.push("--unshare-uts".into());
197 args.push("--unshare-cgroup".into());
198 args.push("--uid".into());
199 args.push("0".into());
200 args.push("--gid".into());
201 args.push("0".into());
202 args.push("--hostname".into());
203 args.push("container".into());
204 args.push("--chdir".into());
205 args.push("/".into());
206 args.push("--die-with-parent".into());
207 args.push("--cap-drop".into());
208 args.push("all".into());
209 args.push("--cap-add".into());
210 args.push("cap_setfcap".into());
211 args.push("--cap-add".into());
212 args.push("cap_sys_admin".into());
213 if stage == 3 {
214 args.push("--cap-add".into());
215 args.push("cap_net_admin".into());
216 }
217
218 if stage == 0 {
219 if let Some(base_image) = &container.base_image {
220 args.append(&mut ok_or!(
221 ro_bind_subentries_keep_symlinks(base_image, "/"),
222 msg_ret!("Failed binding staging image")
223 ));
224 args.append(&mut ok_or!(
225 ro_bind_subentries_keep_symlinks(base_image, "/container_staging_image"),
226 msg_ret!("Failed binding staging image")
227 ));
228 } else {
229 args.append(&mut ok_or!(
230 ro_bind_filesystem("/"),
231 msg_ret!("Failed binding staging image")
232 ));
233 args.append(&mut ok_or!(
234 ro_bind_filesystem("/container_staging_image"),
235 msg_ret!("Failed binding staging image")
236 ));
237 }
238 let current_exe = ok_or!(env::current_exe(), msg_ret!("Failed getting current exe"));
239 args.push("--ro-bind".into());
240 args.push(current_exe.clone().into());
241 args.push("/keg-bin".into());
242 args.push("--ro-bind".into());
243 args.push(current_exe.into());
244 args.push("/container_staging_image/keg-bin".into());
245 } else {
246 args.append(&mut ok_or!(
247 ro_bind_subentries_keep_symlinks("/container_staging_image", "/"),
248 msg_ret!("Failed binding staging image")
249 ));
250 args.append(&mut ok_or!(
251 ro_bind_subentries_keep_symlinks(
252 "/container_staging_image",
253 "/container_staging_image"
254 ),
255 msg_ret!("Failed binding staging image")
256 ));
257 }
258
259 args.push("--proc".into());
260 args.push("/proc".into());
261 args.push("--tmpfs".into());
262 args.push("/tmp".into());
263 args.push("--dev".into());
264 args.push("/dev".into());
265 args.push("--mqueue".into());
266 args.push("/dev/mqueue".into());
267 args.push("--dev-bind".into());
268 args.push("/dev/fuse".into());
269 args.push("/dev/fuse".into());
270 args.push("--dev-bind".into());
271 args.push("/dev/net/tun".into());
272 args.push("/dev/net/tun".into());
273
274 let mut bind_index: u64 = 0;
275 for option in &container.options {
276 let bind = match option {
278 Options::Bind(bind) => Some(bind),
279 Options::DevBind(bind) => Some(bind),
280 Options::RoBind(bind) => Some(bind),
281 _ => None,
282 };
283 match option {
284 Options::Bind(_) => args.push("--bind".into()),
285 Options::DevBind(_) => args.push("--dev-bind".into()),
286 Options::RoBind(_) => args.push("--ro-bind".into()),
287 _ => (),
288 }
289 if let Some(Bind { src, dest: _ }) = bind {
290 if stage == 0 {
291 args.push(src.clone());
292 } else {
293 args.push(("/container_bind_".to_owned() + &bind_index.to_string()).into());
294 }
295 args.push(("/container_bind_".to_owned() + &bind_index.to_string()).into());
296 bind_index += 1;
297 }
298 }
299 let (mut stream, sock) = some_or!(socket_pair(), msg_ret!("Cannot create socket pair"));
300 args.push("--".into());
303 args.push("/keg-bin".into());
304 args.push("--inner".into());
305 args.push(sock.to_string().into());
306
307 true_or!(cgroup_preexec(stage == 0), return None);
308 let result = bwrap(args, true);
309 true_or!(unsafe { set_cloexec(sock) }, return None);
310 unsafe { close(sock) };
311
312 let mut child = match result {
313 Ok(child) => child,
314 Err(e) => {
315 eprintln!("Cannot run bwrap: {e}");
316 return None;
317 }
318 };
319
320 let mut container_clone = container.clone();
321 if stage == 0 {
322 cleanup_container(&mut container_clone);
324 }
325 if stage == 4 {
326 container_clone.net_nft_rules = Vec::new();
328 }
329 let runner = ContainerRunner {
330 stage: if stage <= 5 {
331 Stage::Isolation(stage + 1)
332 } else {
333 Stage::Mounting
334 },
335 container: container_clone,
336 env: env.into_owned(),
337 };
338 ok_or!(
339 stream.write_all(&ok_or!(bincode::serialize(&runner), return None)),
340 return None
341 );
342 let response: ContainerRunnerResponse =
343 ok_or!(bincode::deserialize_from(&mut stream), return None);
344 true_or!(cgroup_postexec(stage == 0), return None);
345 if stage == 1 || stage == 3 || stage == 5 {
346 true_or!(run_slirp(&container, &response), return None);
347 }
348 ok_or!(stream.write_all(&[0u8]), return None);
349
350 if wait {
351 child.wait().ok()
352 } else {
353 Some(ExitStatus::from_raw(0))
354 }
355}