keg/container/
isolation.rs

1use super::utils::{ro_bind_subentries_keep_symlinks, CLONE_NEWTIME};
2use super::{Bind, Container, ContainerRunner, ContainerRunnerResponse, Options, SetEnv, Stage};
3use crate::bwrap::bwrap;
4use crate::cgroup::{cgroup_init, cgroup_postexec, cgroup_preexec};
5use crate::filesystem;
6use crate::slirp::slirp;
7use crate::socket_pair::{set_cloexec, socket_pair};
8use crate::{msg_and, msg_ret, ok_or, some_or, true_or};
9use bincode;
10use libc::{close, unshare};
11use std::borrow::Cow;
12use std::collections::HashMap;
13use std::env;
14use std::ffi::OsString;
15use std::fs;
16use std::io::{self, Read, Write};
17use std::os::unix::process::ExitStatusExt;
18use std::path::Path;
19use std::process::ExitStatus;
20
21fn run_slirp(_container: &Container, response: &ContainerRunnerResponse) -> bool {
22    let (mut slirp_stream, slirp_sock) = some_or!(
23        socket_pair(),
24        msg_and!("Cannot create socket pair"; return false)
25    );
26
27    let mut args = Vec::<OsString>::new();
28    args.push("--configure".into());
29    args.push("--ready-fd".into());
30    args.push(slirp_sock.to_string().into());
31    args.push("--enable-ipv6".into());
32    args.push("--disable-host-loopback".into());
33    args.push(response.pid.to_string().into());
34    args.push("tap0".into());
35
36    let result = slirp(args).map(|_| ());
37    true_or!(unsafe { set_cloexec(slirp_sock) }, return false);
38    unsafe { close(slirp_sock) };
39
40    ok_or!(
41        result,
42        msg_and!("Failed to run slirp4netns: {}", result.unwrap_err(); return false)
43    );
44    ok_or!(
45        slirp_stream.read_exact(&mut [0u8]),
46        msg_and!("slirp init failed"; return false)
47    );
48    true
49}
50
51fn process_env(container: &Container, env: &[(OsString, OsString)]) -> Vec<(OsString, OsString)> {
52    let mut env_map = HashMap::new();
53    env_map.insert(
54        "PATH".into(),
55        "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".into(),
56    );
57
58    if container.keep_env {
59        env_map = HashMap::with_capacity(env.len());
60        for (k, v) in env {
61            env_map.insert(k.to_owned(), v.to_owned());
62        }
63    }
64    for option in &container.options {
65        match option {
66            Options::SetEnv(SetEnv { key, value }) => {
67                env_map.insert(key.to_owned(), value.to_owned());
68            }
69            Options::UnsetEnv(key) => {
70                env_map.remove(key);
71            }
72            _ => (),
73        };
74    }
75    env_map.into_iter().collect()
76}
77
78fn cleanup_container(container: &mut Container) {
79    container.keep_env = false;
80    container.base_image = None;
81    container.options.retain_mut(|option| match option {
82        Options::SetEnv(_) | Options::UnsetEnv(_) => false,
83        Options::Bind(Bind { src, dest: _ }) => {
84            *src = "".into();
85            true
86        }
87        Options::DevBind(Bind { src, dest: _ }) => {
88            *src = "".into();
89            true
90        }
91        Options::RoBind(Bind { src, dest: _ }) => {
92            *src = "".into();
93            true
94        }
95        Options::Symlink(_) => true,
96        Options::Dir(_) => true,
97    });
98}
99
100fn run_nft(rules: &Path) -> bool {
101    let mut args = Vec::<OsString>::new();
102    args.push("--unshare-ipc".into());
103    args.push("--unshare-pid".into());
104    args.push("--unshare-uts".into());
105    args.push("--unshare-cgroup".into());
106    args.push("--uid".into());
107    args.push("0".into());
108    args.push("--gid".into());
109    args.push("0".into());
110    args.push("--hostname".into());
111    args.push("".into());
112    args.push("--chdir".into());
113    args.push("/".into());
114    args.append(&mut ok_or!(
115        ro_bind_subentries_keep_symlinks("/container_staging_image", "/"),
116        msg_and!("Failed binding staging image"; return false)
117    ));
118    args.push("--ro-bind".into());
119    args.push(rules.to_owned().into());
120    args.push("/container_net_nft_rules".into());
121    args.push("--die-with-parent".into());
122    args.push("--cap-drop".into());
123    args.push("all".into());
124    args.push("--cap-add".into());
125    args.push("cap_net_admin".into());
126    args.push("--".into());
127
128    args.push("/usr/sbin/nft".into());
129    args.push("-f".into());
130    args.push("/container_net_nft_rules".into());
131
132    let exit_status = ok_or!(ok_or!(bwrap(args, true), return false).wait(), return false);
133    true_or!(exit_status.success(), return false);
134    true
135}
136
137pub fn ro_bind_filesystem<D>(dest: D) -> io::Result<Vec<OsString>>
138where
139    D: AsRef<Path>,
140{
141    let dest: &Path = dest.as_ref();
142
143    let mut result = Vec::new();
144    filesystem::iterate(false, |file_name, symlink| match symlink {
145        None => {
146            result.push("--ro-bind".into());
147            result.push(Path::new("/").join(file_name).into());
148            result.push(dest.join(file_name).into());
149        }
150        Some(symlink) => {
151            result.push("--symlink".into());
152            result.push(symlink.into());
153            result.push(dest.join(file_name).into());
154        }
155    })?;
156    Ok(result)
157}
158
159pub fn run_container(
160    stage: u8,
161    container: &Container,
162    env: &[(OsString, OsString)],
163    wait: bool,
164) -> Option<ExitStatus> {
165    true_or!(cgroup_init(stage == 0), return None);
166
167    let env: Cow<_> = if stage == 0 {
168        Cow::Owned(process_env(container, env))
169    } else {
170        Cow::Borrowed(env)
171    };
172
173    if !container.share_time && stage > 0 {
174        true_or!(unsafe { unshare(CLONE_NEWTIME) } == 0, return None);
175    }
176    if stage == 4 {
177        // Load nft rules and **make sure** the load succeeds.
178        ok_or!(
179            fs::write("/container_net_nft_rules", &container.net_nft_rules),
180            return None
181        );
182        let result = run_nft(Path::new("/container_net_nft_rules"));
183        ok_or!(fs::remove_file("/container_net_nft_rules"), return None);
184        true_or!(result, return None);
185    }
186
187    let mut args = Vec::<OsString>::new();
188    args.push("--unshare-user".into());
189    args.push("--unshare-ipc".into());
190    if stage != 1 && stage != 3 && stage != 5 {
191        args.push("--unshare-pid".into());
192    }
193    if !container.share_net || (stage != 0 && stage != 2 && stage != 4 && stage != 6) {
194        args.push("--unshare-net".into());
195    }
196    args.push("--unshare-uts".into());
197    args.push("--unshare-cgroup".into());
198    args.push("--uid".into());
199    args.push("0".into());
200    args.push("--gid".into());
201    args.push("0".into());
202    args.push("--hostname".into());
203    args.push("container".into());
204    args.push("--chdir".into());
205    args.push("/".into());
206    args.push("--die-with-parent".into());
207    args.push("--cap-drop".into());
208    args.push("all".into());
209    args.push("--cap-add".into());
210    args.push("cap_setfcap".into());
211    args.push("--cap-add".into());
212    args.push("cap_sys_admin".into());
213    if stage == 3 {
214        args.push("--cap-add".into());
215        args.push("cap_net_admin".into());
216    }
217
218    if stage == 0 {
219        if let Some(base_image) = &container.base_image {
220            args.append(&mut ok_or!(
221                ro_bind_subentries_keep_symlinks(base_image, "/"),
222                msg_ret!("Failed binding staging image")
223            ));
224            args.append(&mut ok_or!(
225                ro_bind_subentries_keep_symlinks(base_image, "/container_staging_image"),
226                msg_ret!("Failed binding staging image")
227            ));
228        } else {
229            args.append(&mut ok_or!(
230                ro_bind_filesystem("/"),
231                msg_ret!("Failed binding staging image")
232            ));
233            args.append(&mut ok_or!(
234                ro_bind_filesystem("/container_staging_image"),
235                msg_ret!("Failed binding staging image")
236            ));
237        }
238        let current_exe = ok_or!(env::current_exe(), msg_ret!("Failed getting current exe"));
239        args.push("--ro-bind".into());
240        args.push(current_exe.clone().into());
241        args.push("/keg-bin".into());
242        args.push("--ro-bind".into());
243        args.push(current_exe.into());
244        args.push("/container_staging_image/keg-bin".into());
245    } else {
246        args.append(&mut ok_or!(
247            ro_bind_subentries_keep_symlinks("/container_staging_image", "/"),
248            msg_ret!("Failed binding staging image")
249        ));
250        args.append(&mut ok_or!(
251            ro_bind_subentries_keep_symlinks(
252                "/container_staging_image",
253                "/container_staging_image"
254            ),
255            msg_ret!("Failed binding staging image")
256        ));
257    }
258
259    args.push("--proc".into());
260    args.push("/proc".into());
261    args.push("--tmpfs".into());
262    args.push("/tmp".into());
263    args.push("--dev".into());
264    args.push("/dev".into());
265    args.push("--mqueue".into());
266    args.push("/dev/mqueue".into());
267    args.push("--dev-bind".into());
268    args.push("/dev/fuse".into());
269    args.push("/dev/fuse".into());
270    args.push("--dev-bind".into());
271    args.push("/dev/net/tun".into());
272    args.push("/dev/net/tun".into());
273
274    let mut bind_index: u64 = 0;
275    for option in &container.options {
276        // Binds
277        let bind = match option {
278            Options::Bind(bind) => Some(bind),
279            Options::DevBind(bind) => Some(bind),
280            Options::RoBind(bind) => Some(bind),
281            _ => None,
282        };
283        match option {
284            Options::Bind(_) => args.push("--bind".into()),
285            Options::DevBind(_) => args.push("--dev-bind".into()),
286            Options::RoBind(_) => args.push("--ro-bind".into()),
287            _ => (),
288        }
289        if let Some(Bind { src, dest: _ }) = bind {
290            if stage == 0 {
291                args.push(src.clone());
292            } else {
293                args.push(("/container_bind_".to_owned() + &bind_index.to_string()).into());
294            }
295            args.push(("/container_bind_".to_owned() + &bind_index.to_string()).into());
296            bind_index += 1;
297        }
298    }
299    let (mut stream, sock) = some_or!(socket_pair(), msg_ret!("Cannot create socket pair"));
300    // TODO: Close the other socket on error
301
302    args.push("--".into());
303    args.push("/keg-bin".into());
304    args.push("--inner".into());
305    args.push(sock.to_string().into());
306
307    true_or!(cgroup_preexec(stage == 0), return None);
308    let result = bwrap(args, true);
309    true_or!(unsafe { set_cloexec(sock) }, return None);
310    unsafe { close(sock) };
311
312    let mut child = match result {
313        Ok(child) => child,
314        Err(e) => {
315            eprintln!("Cannot run bwrap: {e}");
316            return None;
317        }
318    };
319
320    let mut container_clone = container.clone();
321    if stage == 0 {
322        // Remove information we already applied.
323        cleanup_container(&mut container_clone);
324    }
325    if stage == 4 {
326        // nft rules already applied.
327        container_clone.net_nft_rules = Vec::new();
328    }
329    let runner = ContainerRunner {
330        stage: if stage <= 5 {
331            Stage::Isolation(stage + 1)
332        } else {
333            Stage::Mounting
334        },
335        container: container_clone,
336        env: env.into_owned(),
337    };
338    ok_or!(
339        stream.write_all(&ok_or!(bincode::serialize(&runner), return None)),
340        return None
341    );
342    let response: ContainerRunnerResponse =
343        ok_or!(bincode::deserialize_from(&mut stream), return None);
344    true_or!(cgroup_postexec(stage == 0), return None);
345    if stage == 1 || stage == 3 || stage == 5 {
346        true_or!(run_slirp(&container, &response), return None);
347    }
348    ok_or!(stream.write_all(&[0u8]), return None);
349
350    if wait {
351        child.wait().ok()
352    } else {
353        Some(ExitStatus::from_raw(0))
354    }
355}