krataguest/
init.rs

1use anyhow::{anyhow, Result};
2use cgroups_rs::{Cgroup, CgroupPid};
3use futures::stream::TryStreamExt;
4use ipnetwork::IpNetwork;
5use krata::ethtool::EthtoolHandle;
6use krata::idm::client::IdmInternalClient;
7use krata::idm::internal::INTERNAL_IDM_CHANNEL;
8use krata::launchcfg::{LaunchInfo, LaunchNetwork, LaunchPackedFormat};
9use libc::{sethostname, setsid, TIOCSCTTY};
10use log::{trace, warn};
11use nix::ioctl_write_int_bad;
12use nix::unistd::{dup2, execve, fork, ForkResult, Pid};
13use oci_spec::image::{Config, ImageConfiguration};
14use path_absolutize::Absolutize;
15use platform_info::{PlatformInfo, PlatformInfoAPI, UNameAPI};
16use std::collections::HashMap;
17use std::ffi::CString;
18use std::fs::{File, OpenOptions, Permissions};
19use std::io;
20use std::net::{Ipv4Addr, Ipv6Addr};
21use std::os::fd::AsRawFd;
22use std::os::unix::ffi::OsStrExt;
23use std::os::unix::fs::{chroot, PermissionsExt};
24use std::path::{Path, PathBuf};
25use std::str::FromStr;
26use sys_mount::{FilesystemType, Mount, MountFlags};
27use tokio::fs;
28
29use crate::background::GuestBackground;
30
/// Block device that `mount_root_image` mounts as the packed root image.
const IMAGE_BLOCK_DEVICE_PATH: &str = "/dev/xvda";
/// Block device that `mount_config_image` mounts (as squashfs) to obtain the config files.
const CONFIG_BLOCK_DEVICE_PATH: &str = "/dev/xvdb";

/// Mount point of the root image.
const IMAGE_MOUNT_PATH: &str = "/image";
/// Mount point of the config image.
const CONFIG_MOUNT_PATH: &str = "/config";
/// Mount point of the tmpfs that holds the overlay's directories.
const OVERLAY_MOUNT_PATH: &str = "/overlay";

/// Read-only bind mount of `IMAGE_MOUNT_PATH`; used as the overlay `lowerdir`.
const OVERLAY_IMAGE_BIND_PATH: &str = "/overlay/image";
/// Overlay `workdir`.
const OVERLAY_WORK_PATH: &str = "/overlay/work";
/// Overlay `upperdir`.
const OVERLAY_UPPER_PATH: &str = "/overlay/upper";

// Kernel filesystems mounted during early init; `bind_new_root` moves them
// into the new root.
const SYS_PATH: &str = "/sys";
const PROC_PATH: &str = "/proc";
const DEV_PATH: &str = "/dev";

/// Mount point of the overlay filesystem that `bind_new_root` chroots into.
const NEW_ROOT_PATH: &str = "/newroot";
// Destinations of the kernel filesystem moves inside the new root.
const NEW_ROOT_SYS_PATH: &str = "/newroot/sys";
const NEW_ROOT_PROC_PATH: &str = "/newroot/proc";
const NEW_ROOT_DEV_PATH: &str = "/newroot/dev";

/// JSON file parsed by `parse_image_config` into an `ImageConfiguration`.
const IMAGE_CONFIG_JSON_PATH: &str = "/config/image/config.json";
/// JSON file parsed by `parse_launch_config` into a `LaunchInfo`.
const LAUNCH_CONFIG_JSON_PATH: &str = "/config/launch.json";

/// Optional addons device; `mount_addons` does nothing when it is absent.
const ADDONS_DEVICE_PATH: &str = "/dev/xvdc";
/// Mount point of the addons image.
const ADDONS_MOUNT_PATH: &str = "/addons";
/// Directory inside the addons image that `mount_kernel_modules` bind-mounts
/// into the new root.
const ADDONS_MODULES_PATH: &str = "/addons/modules";

// Generates the free function `set_controlling_terminal(fd, arg)` that issues
// the TIOCSCTTY ioctl; it is called from `GuestInit::set_controlling_terminal`.
ioctl_write_int_bad!(set_controlling_terminal, TIOCSCTTY);
59
/// Stateless driver for the guest boot sequence; see [`GuestInit::init`].
pub struct GuestInit {}
61
62impl Default for GuestInit {
63    fn default() -> Self {
64        Self::new()
65    }
66}
67
68impl GuestInit {
69    pub fn new() -> GuestInit {
70        GuestInit {}
71    }
72
73    pub async fn init(&mut self) -> Result<()> {
74        self.early_init().await?;
75
76        trace!("opening console descriptor");
77        match OpenOptions::new()
78            .read(true)
79            .write(true)
80            .open("/dev/console")
81        {
82            Ok(console) => self.map_console(&console)?,
83            Err(error) => warn!("failed to open console: {}", error),
84        };
85
86        let idm = IdmInternalClient::open(INTERNAL_IDM_CHANNEL, "/dev/hvc1")
87            .await
88            .map_err(|x| anyhow!("failed to open idm client: {}", x))?;
89        self.mount_config_image().await?;
90
91        let config = self.parse_image_config().await?;
92        let launch = self.parse_launch_config().await?;
93
94        self.mount_root_image(launch.root.format.clone()).await?;
95
96        self.mount_addons().await?;
97
98        self.mount_new_root().await?;
99        self.mount_kernel_modules().await?;
100        self.bind_new_root().await?;
101
102        if let Some(hostname) = launch.hostname.clone() {
103            let result = unsafe {
104                sethostname(
105                    hostname.as_bytes().as_ptr() as *mut libc::c_char,
106                    hostname.len(),
107                )
108            };
109            if result != 0 {
110                warn!("failed to set hostname: {}", result);
111            }
112
113            let etc = PathBuf::from_str("/etc")?;
114            if !etc.exists() {
115                fs::create_dir(&etc).await?;
116            }
117            let mut etc_hostname = etc;
118            etc_hostname.push("hostname");
119            fs::write(&etc_hostname, hostname + "\n").await?;
120        }
121
122        if let Some(network) = &launch.network {
123            trace!("initializing network");
124            if let Err(error) = self.network_setup(&launch, network).await {
125                warn!("failed to initialize network: {}", error);
126            }
127        }
128
129        if let Some(cfg) = config.config() {
130            trace!("running guest task");
131            self.run(cfg, &launch, idm).await?;
132        } else {
133            return Err(anyhow!(
134                "unable to determine what to execute, image config doesn't tell us"
135            ));
136        }
137        Ok(())
138    }
139
140    async fn early_init(&mut self) -> Result<()> {
141        trace!("early init");
142        self.create_dir("/dev", Some(0o0755)).await?;
143        self.create_dir("/proc", None).await?;
144        self.create_dir("/sys", Some(0o0555)).await?;
145        self.create_dir("/root", Some(0o0700)).await?;
146        self.create_dir("/tmp", None).await?;
147        self.create_dir("/run", Some(0o0755)).await?;
148        self.mount_kernel_fs("devtmpfs", "/dev", "mode=0755", None, None)
149            .await?;
150        self.mount_kernel_fs("proc", "/proc", "", None, None)
151            .await?;
152        self.mount_kernel_fs("sysfs", "/sys", "", None, None)
153            .await?;
154        self.create_dir("/dev/pts", Some(0o0755)).await?;
155        self.mount_kernel_fs("devpts", "/dev/pts", "", None, Some("/dev/ptmx"))
156            .await?;
157        fs::symlink("/proc/self/fd", "/dev/fd").await?;
158        fs::symlink("/proc/self/fd/0", "/dev/stdin").await?;
159        fs::symlink("/proc/self/fd/1", "/dev/stdout").await?;
160        fs::symlink("/proc/self/fd/2", "/dev/stderr").await?;
161        self.mount_kernel_fs(
162            "cgroup2",
163            "/sys/fs/cgroup",
164            "",
165            Some(MountFlags::RELATIME),
166            None,
167        )
168        .await?;
169        Ok(())
170    }
171
172    async fn mount_addons(&mut self) -> Result<()> {
173        if !fs::try_exists(ADDONS_DEVICE_PATH).await? {
174            return Ok(());
175        }
176
177        self.mount_image(
178            &PathBuf::from(ADDONS_DEVICE_PATH),
179            &PathBuf::from(ADDONS_MOUNT_PATH),
180            LaunchPackedFormat::Squashfs,
181        )
182        .await?;
183        Ok(())
184    }
185
186    async fn mount_kernel_modules(&mut self) -> Result<()> {
187        if !fs::try_exists(ADDONS_MODULES_PATH).await? {
188            return Ok(());
189        }
190
191        let Some(platform_info) = PlatformInfo::new().ok() else {
192            return Ok(());
193        };
194
195        let kernel_release = platform_info.release().to_string_lossy().to_string();
196        let modules_path = format!("/newroot/lib/modules/{}", kernel_release);
197        fs::create_dir_all(&modules_path).await?;
198        Mount::builder()
199            .fstype(FilesystemType::Manual("none"))
200            .flags(MountFlags::BIND | MountFlags::RDONLY)
201            .mount(ADDONS_MODULES_PATH, modules_path)?;
202        Ok(())
203    }
204
205    async fn create_dir(&mut self, path: &str, mode: Option<u32>) -> Result<()> {
206        let path = Path::new(path);
207        if !path.is_dir() {
208            trace!("creating directory {:?}", path);
209            fs::create_dir(path).await?;
210        }
211        if let Some(mode) = mode {
212            let permissions = Permissions::from_mode(mode);
213            trace!("setting directory {:?} permissions to {:?}", path, mode);
214            fs::set_permissions(path, permissions).await?;
215        }
216        Ok(())
217    }
218
219    async fn mount_kernel_fs(
220        &mut self,
221        fstype: &str,
222        path: &str,
223        data: &str,
224        flags: Option<MountFlags>,
225        source: Option<&str>,
226    ) -> Result<()> {
227        trace!("mounting kernel fs {} to {}", fstype, path);
228        Mount::builder()
229            .fstype(FilesystemType::Manual(fstype))
230            .flags(MountFlags::NOEXEC | MountFlags::NOSUID | flags.unwrap_or(MountFlags::empty()))
231            .data(data)
232            .mount(source.unwrap_or(fstype), path)?;
233        Ok(())
234    }
235
236    fn map_console(&mut self, console: &File) -> Result<()> {
237        trace!("mapping console");
238        dup2(console.as_raw_fd(), 0)?;
239        dup2(console.as_raw_fd(), 1)?;
240        dup2(console.as_raw_fd(), 2)?;
241        Ok(())
242    }
243
244    async fn mount_config_image(&mut self) -> Result<()> {
245        trace!("mounting config image");
246        let config_mount_path = Path::new(CONFIG_MOUNT_PATH);
247        self.mount_image(
248            Path::new(CONFIG_BLOCK_DEVICE_PATH),
249            config_mount_path,
250            LaunchPackedFormat::Squashfs,
251        )
252        .await?;
253        Ok(())
254    }
255
256    async fn mount_root_image(&mut self, format: LaunchPackedFormat) -> Result<()> {
257        trace!("mounting root image");
258        let image_mount_path = Path::new(IMAGE_MOUNT_PATH);
259        self.mount_image(Path::new(IMAGE_BLOCK_DEVICE_PATH), image_mount_path, format)
260            .await?;
261        Ok(())
262    }
263
264    async fn mount_image(
265        &mut self,
266        from: &Path,
267        to: &Path,
268        format: LaunchPackedFormat,
269    ) -> Result<()> {
270        trace!("mounting {:?} image {:?} to {:?}", format, from, to);
271        if !to.is_dir() {
272            fs::create_dir(to).await?;
273        }
274        Mount::builder()
275            .fstype(FilesystemType::Manual(match format {
276                LaunchPackedFormat::Squashfs => "squashfs",
277                LaunchPackedFormat::Erofs => "erofs",
278            }))
279            .flags(MountFlags::RDONLY)
280            .mount(from, to)?;
281        Ok(())
282    }
283
284    async fn mount_move_subtree(&mut self, from: &Path, to: &Path) -> Result<()> {
285        trace!("moving subtree {:?} to {:?}", from, to);
286        if !to.is_dir() {
287            fs::create_dir(to).await?;
288        }
289        Mount::builder()
290            .fstype(FilesystemType::Manual("none"))
291            .flags(MountFlags::MOVE)
292            .mount(from, to)?;
293        Ok(())
294    }
295
296    async fn mount_new_root(&mut self) -> Result<()> {
297        trace!("mounting new root");
298        self.mount_overlay_tmpfs().await?;
299        self.bind_image_to_overlay_tmpfs().await?;
300        self.mount_overlay_to_new_root().await?;
301        std::env::set_current_dir(NEW_ROOT_PATH)?;
302        trace!("mounted new root");
303        Ok(())
304    }
305
306    async fn mount_overlay_tmpfs(&mut self) -> Result<()> {
307        fs::create_dir(OVERLAY_MOUNT_PATH).await?;
308        Mount::builder()
309            .fstype(FilesystemType::Manual("tmpfs"))
310            .mount("tmpfs", OVERLAY_MOUNT_PATH)?;
311        fs::create_dir(OVERLAY_UPPER_PATH).await?;
312        fs::create_dir(OVERLAY_WORK_PATH).await?;
313        Ok(())
314    }
315
316    async fn bind_image_to_overlay_tmpfs(&mut self) -> Result<()> {
317        fs::create_dir(OVERLAY_IMAGE_BIND_PATH).await?;
318        Mount::builder()
319            .fstype(FilesystemType::Manual("none"))
320            .flags(MountFlags::BIND | MountFlags::RDONLY)
321            .mount(IMAGE_MOUNT_PATH, OVERLAY_IMAGE_BIND_PATH)?;
322        Ok(())
323    }
324
325    async fn mount_overlay_to_new_root(&mut self) -> Result<()> {
326        fs::create_dir(NEW_ROOT_PATH).await?;
327        Mount::builder()
328            .fstype(FilesystemType::Manual("overlay"))
329            .flags(MountFlags::NOATIME)
330            .data(&format!(
331                "lowerdir={},upperdir={},workdir={}",
332                OVERLAY_IMAGE_BIND_PATH, OVERLAY_UPPER_PATH, OVERLAY_WORK_PATH
333            ))
334            .mount(format!("overlayfs:{}", OVERLAY_MOUNT_PATH), NEW_ROOT_PATH)?;
335        Ok(())
336    }
337
338    async fn parse_image_config(&mut self) -> Result<ImageConfiguration> {
339        let image_config_path = Path::new(IMAGE_CONFIG_JSON_PATH);
340        let content = fs::read_to_string(image_config_path).await?;
341        let config = serde_json::from_str(&content)?;
342        Ok(config)
343    }
344
345    async fn parse_launch_config(&mut self) -> Result<LaunchInfo> {
346        trace!("parsing launch config");
347        let launch_config = Path::new(LAUNCH_CONFIG_JSON_PATH);
348        let content = fs::read_to_string(launch_config).await?;
349        Ok(serde_json::from_str(&content)?)
350    }
351
352    async fn bind_new_root(&mut self) -> Result<()> {
353        self.mount_move_subtree(Path::new(SYS_PATH), Path::new(NEW_ROOT_SYS_PATH))
354            .await?;
355        self.mount_move_subtree(Path::new(PROC_PATH), Path::new(NEW_ROOT_PROC_PATH))
356            .await?;
357        self.mount_move_subtree(Path::new(DEV_PATH), Path::new(NEW_ROOT_DEV_PATH))
358            .await?;
359        trace!("binding new root");
360        Mount::builder()
361            .fstype(FilesystemType::Manual("none"))
362            .flags(MountFlags::BIND)
363            .mount(".", "/")?;
364        trace!("chrooting into new root");
365        chroot(".")?;
366        trace!("setting root as current directory");
367        std::env::set_current_dir("/")?;
368        Ok(())
369    }
370
371    async fn network_setup(&mut self, cfg: &LaunchInfo, network: &LaunchNetwork) -> Result<()> {
372        trace!("setting up network for link");
373
374        let etc = PathBuf::from_str("/etc")?;
375        if !etc.exists() {
376            fs::create_dir(etc).await?;
377        }
378        let resolv = PathBuf::from_str("/etc/resolv.conf")?;
379
380        {
381            let mut lines = vec!["# krata resolver configuration".to_string()];
382            for nameserver in &network.resolver.nameservers {
383                lines.push(format!("nameserver {}", nameserver));
384            }
385
386            let mut conf = lines.join("\n");
387            conf.push('\n');
388            fs::write(resolv, conf).await?;
389        }
390
391        let hosts = PathBuf::from_str("/etc/hosts")?;
392        if let Some(ref hostname) = cfg.hostname {
393            let mut lines = if hosts.exists() {
394                fs::read_to_string(&hosts)
395                    .await?
396                    .lines()
397                    .map(|x| x.to_string())
398                    .collect::<Vec<_>>()
399            } else {
400                vec!["127.0.0.1 localhost".to_string()]
401            };
402            lines.push(format!("127.0.1.1 {}", hostname));
403            fs::write(&hosts, lines.join("\n") + "\n").await?;
404        }
405
406        self.network_configure_ethtool(network).await?;
407        self.network_configure_link(network).await?;
408        Ok(())
409    }
410
411    async fn network_configure_link(&mut self, network: &LaunchNetwork) -> Result<()> {
412        let (connection, handle, _) = rtnetlink::new_connection()?;
413        tokio::spawn(connection);
414
415        let mut links = handle.link().get().match_name("lo".to_string()).execute();
416        let Some(link) = links.try_next().await? else {
417            warn!("unable to find link named lo");
418            return Ok(());
419        };
420
421        handle.link().set(link.header.index).up().execute().await?;
422
423        let ipv4_network: IpNetwork = network.ipv4.address.parse()?;
424        let ipv4_gateway: Ipv4Addr = network.ipv4.gateway.parse()?;
425        let ipv6_network: IpNetwork = network.ipv6.address.parse()?;
426        let ipv6_gateway: Ipv6Addr = network.ipv6.gateway.parse()?;
427
428        let mut links = handle
429            .link()
430            .get()
431            .match_name(network.link.clone())
432            .execute();
433        let Some(link) = links.try_next().await? else {
434            warn!("unable to find link named {}", network.link);
435            return Ok(());
436        };
437
438        handle
439            .address()
440            .add(link.header.index, ipv4_network.ip(), ipv4_network.prefix())
441            .execute()
442            .await?;
443
444        let ipv6_result = handle
445            .address()
446            .add(link.header.index, ipv6_network.ip(), ipv6_network.prefix())
447            .execute()
448            .await;
449
450        let ipv6_ready = match ipv6_result {
451            Ok(()) => true,
452            Err(error) => {
453                warn!("unable to setup ipv6 network: {}", error);
454                false
455            }
456        };
457
458        handle.link().set(link.header.index).up().execute().await?;
459
460        handle
461            .route()
462            .add()
463            .v4()
464            .destination_prefix(Ipv4Addr::UNSPECIFIED, 0)
465            .output_interface(link.header.index)
466            .gateway(ipv4_gateway)
467            .execute()
468            .await?;
469
470        if ipv6_ready {
471            let ipv6_gw_result = handle
472                .route()
473                .add()
474                .v6()
475                .destination_prefix(Ipv6Addr::UNSPECIFIED, 0)
476                .output_interface(link.header.index)
477                .gateway(ipv6_gateway)
478                .execute()
479                .await;
480
481            if let Err(error) = ipv6_gw_result {
482                warn!("failed to add ipv6 gateway route: {}", error);
483            }
484        }
485        Ok(())
486    }
487
488    async fn network_configure_ethtool(&mut self, network: &LaunchNetwork) -> Result<()> {
489        let mut handle = EthtoolHandle::new()?;
490        handle.set_gso(&network.link, false)?;
491        handle.set_tso(&network.link, false)?;
492        Ok(())
493    }
494
495    async fn run(
496        &mut self,
497        config: &Config,
498        launch: &LaunchInfo,
499        idm: IdmInternalClient,
500    ) -> Result<()> {
501        let mut cmd = match config.cmd() {
502            None => vec![],
503            Some(value) => value.clone(),
504        };
505
506        if launch.run.is_some() {
507            cmd.clone_from(launch.run.as_ref().unwrap());
508        }
509
510        if let Some(entrypoint) = config.entrypoint() {
511            for item in entrypoint.iter().rev() {
512                cmd.insert(0, item.to_string());
513            }
514        }
515
516        if cmd.is_empty() {
517            cmd.push("/bin/sh".to_string());
518        }
519
520        let path = cmd.remove(0);
521
522        let mut env = HashMap::new();
523        if let Some(config_env) = config.env() {
524            env.extend(GuestInit::env_map(config_env));
525        }
526        env.extend(launch.env.clone());
527        env.insert("KRATA_CONTAINER".to_string(), "1".to_string());
528
529        // If we were not provided a terminal definition in our launch manifest, we
530        // default to xterm as most terminal emulators support the xterm control codes.
531        if !env.contains_key("TERM") {
532            env.insert("TERM".to_string(), "xterm".to_string());
533        }
534
535        let path = resolve_executable(&env, path.into())?;
536        let Some(file_name) = path.file_name() else {
537            return Err(anyhow!("cannot get file name of command path"));
538        };
539        let Some(file_name) = file_name.to_str() else {
540            return Err(anyhow!("cannot get file name of command path as str"));
541        };
542        cmd.insert(0, file_name.to_string());
543        let env = GuestInit::env_list(env);
544
545        trace!("running guest command: {}", cmd.join(" "));
546
547        let path = CString::new(path.as_os_str().as_bytes())?;
548        let cmd = GuestInit::strings_as_cstrings(cmd)?;
549        let env = GuestInit::strings_as_cstrings(env)?;
550        let mut working_dir = config
551            .working_dir()
552            .as_ref()
553            .map(|x| x.to_string())
554            .unwrap_or("/".to_string());
555
556        if working_dir.is_empty() {
557            working_dir = "/".to_string();
558        }
559
560        let cgroup = self.init_cgroup().await?;
561        self.fork_and_exec(idm, cgroup, working_dir, path, cmd, env)
562            .await?;
563        Ok(())
564    }
565
566    async fn init_cgroup(&self) -> Result<Cgroup> {
567        trace!("initializing cgroup");
568        let hierarchy = cgroups_rs::hierarchies::auto();
569        let cgroup = Cgroup::new(hierarchy, "krata-guest-task")?;
570        cgroup.set_cgroup_type("threaded")?;
571        trace!("initialized cgroup");
572        Ok(cgroup)
573    }
574
575    fn strings_as_cstrings(values: Vec<String>) -> Result<Vec<CString>> {
576        let mut results: Vec<CString> = vec![];
577        for value in values {
578            results.push(CString::new(value.as_bytes().to_vec())?);
579        }
580        Ok(results)
581    }
582
583    fn env_map(env: &[String]) -> HashMap<String, String> {
584        let mut map = HashMap::<String, String>::new();
585        for item in env {
586            if let Some((key, value)) = item.split_once('=') {
587                map.insert(key.to_string(), value.to_string());
588            }
589        }
590        map
591    }
592
593    fn env_list(env: HashMap<String, String>) -> Vec<String> {
594        env.iter()
595            .map(|(key, value)| format!("{}={}", key, value))
596            .collect::<Vec<String>>()
597    }
598
599    async fn fork_and_exec(
600        &mut self,
601        idm: IdmInternalClient,
602        cgroup: Cgroup,
603        working_dir: String,
604        path: CString,
605        cmd: Vec<CString>,
606        env: Vec<CString>,
607    ) -> Result<()> {
608        match unsafe { fork()? } {
609            ForkResult::Parent { child } => self.background(idm, cgroup, child).await,
610            ForkResult::Child => self.foreground(cgroup, working_dir, path, cmd, env).await,
611        }
612    }
613
614    async fn foreground(
615        &mut self,
616        cgroup: Cgroup,
617        working_dir: String,
618        path: CString,
619        cmd: Vec<CString>,
620        env: Vec<CString>,
621    ) -> Result<()> {
622        GuestInit::set_controlling_terminal()?;
623        std::env::set_current_dir(working_dir)?;
624        cgroup.add_task(CgroupPid::from(std::process::id() as u64))?;
625        execve(&path, &cmd, &env)?;
626        Ok(())
627    }
628
629    fn set_controlling_terminal() -> Result<()> {
630        unsafe {
631            setsid();
632            set_controlling_terminal(io::stdin().as_raw_fd(), 0)?;
633        }
634        Ok(())
635    }
636
637    async fn background(
638        &mut self,
639        idm: IdmInternalClient,
640        cgroup: Cgroup,
641        executed: Pid,
642    ) -> Result<()> {
643        let mut background = GuestBackground::new(idm, cgroup, executed).await?;
644        background.run().await?;
645        Ok(())
646    }
647}
648
649pub fn resolve_executable(env: &HashMap<String, String>, path: PathBuf) -> Result<PathBuf> {
650    if path.is_absolute() {
651        return Ok(path);
652    }
653
654    if path.is_file() {
655        return Ok(path.absolutize()?.to_path_buf());
656    }
657
658    if let Some(path_var) = env.get("PATH") {
659        for item in path_var.split(':') {
660            let mut exe_path: PathBuf = item.into();
661            exe_path.push(&path);
662            if exe_path.is_file() {
663                return Ok(exe_path);
664            }
665        }
666    }
667    Ok(path)
668}