1use anyhow::{anyhow, Result};
2use cgroups_rs::{Cgroup, CgroupPid};
3use futures::stream::TryStreamExt;
4use ipnetwork::IpNetwork;
5use krata::ethtool::EthtoolHandle;
6use krata::idm::client::IdmInternalClient;
7use krata::idm::internal::INTERNAL_IDM_CHANNEL;
8use krata::launchcfg::{LaunchInfo, LaunchNetwork, LaunchPackedFormat};
9use libc::{sethostname, setsid, TIOCSCTTY};
10use log::{trace, warn};
11use nix::ioctl_write_int_bad;
12use nix::unistd::{dup2, execve, fork, ForkResult, Pid};
13use oci_spec::image::{Config, ImageConfiguration};
14use path_absolutize::Absolutize;
15use platform_info::{PlatformInfo, PlatformInfoAPI, UNameAPI};
16use std::collections::HashMap;
17use std::ffi::CString;
18use std::fs::{File, OpenOptions, Permissions};
19use std::io;
20use std::net::{Ipv4Addr, Ipv6Addr};
21use std::os::fd::AsRawFd;
22use std::os::unix::ffi::OsStrExt;
23use std::os::unix::fs::{chroot, PermissionsExt};
24use std::path::{Path, PathBuf};
25use std::str::FromStr;
26use sys_mount::{FilesystemType, Mount, MountFlags};
27use tokio::fs;
28
29use crate::background::GuestBackground;
30
// Block devices that are mounted read-only by `mount_root_image` and
// `mount_config_image` respectively.
const IMAGE_BLOCK_DEVICE_PATH: &str = "/dev/xvda";
const CONFIG_BLOCK_DEVICE_PATH: &str = "/dev/xvdb";

// Mount points for the root image, the config image, and the tmpfs that
// backs the overlay.
const IMAGE_MOUNT_PATH: &str = "/image";
const CONFIG_MOUNT_PATH: &str = "/config";
const OVERLAY_MOUNT_PATH: &str = "/overlay";

// Directories inside the overlay tmpfs: the bind-mounted image is used as the
// overlay's `lowerdir`, the other two as `workdir` and `upperdir`.
const OVERLAY_IMAGE_BIND_PATH: &str = "/overlay/image";
const OVERLAY_WORK_PATH: &str = "/overlay/work";
const OVERLAY_UPPER_PATH: &str = "/overlay/upper";

// Kernel filesystems that `bind_new_root` moves into the new root.
const SYS_PATH: &str = "/sys";
const PROC_PATH: &str = "/proc";
const DEV_PATH: &str = "/dev";

// The overlay mount point that becomes the new root, and the destinations of
// the moved kernel filesystems inside it.
const NEW_ROOT_PATH: &str = "/newroot";
const NEW_ROOT_SYS_PATH: &str = "/newroot/sys";
const NEW_ROOT_PROC_PATH: &str = "/newroot/proc";
const NEW_ROOT_DEV_PATH: &str = "/newroot/dev";

// JSON documents read from the mounted config image.
const IMAGE_CONFIG_JSON_PATH: &str = "/config/image/config.json";
const LAUNCH_CONFIG_JSON_PATH: &str = "/config/launch.json";

// Optional addons image: its device, its mount point, and the directory that
// is bind-mounted as the kernel modules directory when present.
const ADDONS_DEVICE_PATH: &str = "/dev/xvdc";
const ADDONS_MOUNT_PATH: &str = "/addons";
const ADDONS_MODULES_PATH: &str = "/addons/modules";

// Generates the unsafe `set_controlling_terminal(fd, data)` wrapper around the
// `TIOCSCTTY` ioctl; it is called from `GuestInit::set_controlling_terminal`.
ioctl_write_int_bad!(set_controlling_terminal, TIOCSCTTY);
59
60pub struct GuestInit {}
61
62impl Default for GuestInit {
63 fn default() -> Self {
64 Self::new()
65 }
66}
67
68impl GuestInit {
69 pub fn new() -> GuestInit {
70 GuestInit {}
71 }
72
73 pub async fn init(&mut self) -> Result<()> {
74 self.early_init().await?;
75
76 trace!("opening console descriptor");
77 match OpenOptions::new()
78 .read(true)
79 .write(true)
80 .open("/dev/console")
81 {
82 Ok(console) => self.map_console(&console)?,
83 Err(error) => warn!("failed to open console: {}", error),
84 };
85
86 let idm = IdmInternalClient::open(INTERNAL_IDM_CHANNEL, "/dev/hvc1")
87 .await
88 .map_err(|x| anyhow!("failed to open idm client: {}", x))?;
89 self.mount_config_image().await?;
90
91 let config = self.parse_image_config().await?;
92 let launch = self.parse_launch_config().await?;
93
94 self.mount_root_image(launch.root.format.clone()).await?;
95
96 self.mount_addons().await?;
97
98 self.mount_new_root().await?;
99 self.mount_kernel_modules().await?;
100 self.bind_new_root().await?;
101
102 if let Some(hostname) = launch.hostname.clone() {
103 let result = unsafe {
104 sethostname(
105 hostname.as_bytes().as_ptr() as *mut libc::c_char,
106 hostname.len(),
107 )
108 };
109 if result != 0 {
110 warn!("failed to set hostname: {}", result);
111 }
112
113 let etc = PathBuf::from_str("/etc")?;
114 if !etc.exists() {
115 fs::create_dir(&etc).await?;
116 }
117 let mut etc_hostname = etc;
118 etc_hostname.push("hostname");
119 fs::write(&etc_hostname, hostname + "\n").await?;
120 }
121
122 if let Some(network) = &launch.network {
123 trace!("initializing network");
124 if let Err(error) = self.network_setup(&launch, network).await {
125 warn!("failed to initialize network: {}", error);
126 }
127 }
128
129 if let Some(cfg) = config.config() {
130 trace!("running guest task");
131 self.run(cfg, &launch, idm).await?;
132 } else {
133 return Err(anyhow!(
134 "unable to determine what to execute, image config doesn't tell us"
135 ));
136 }
137 Ok(())
138 }
139
140 async fn early_init(&mut self) -> Result<()> {
141 trace!("early init");
142 self.create_dir("/dev", Some(0o0755)).await?;
143 self.create_dir("/proc", None).await?;
144 self.create_dir("/sys", Some(0o0555)).await?;
145 self.create_dir("/root", Some(0o0700)).await?;
146 self.create_dir("/tmp", None).await?;
147 self.create_dir("/run", Some(0o0755)).await?;
148 self.mount_kernel_fs("devtmpfs", "/dev", "mode=0755", None, None)
149 .await?;
150 self.mount_kernel_fs("proc", "/proc", "", None, None)
151 .await?;
152 self.mount_kernel_fs("sysfs", "/sys", "", None, None)
153 .await?;
154 self.create_dir("/dev/pts", Some(0o0755)).await?;
155 self.mount_kernel_fs("devpts", "/dev/pts", "", None, Some("/dev/ptmx"))
156 .await?;
157 fs::symlink("/proc/self/fd", "/dev/fd").await?;
158 fs::symlink("/proc/self/fd/0", "/dev/stdin").await?;
159 fs::symlink("/proc/self/fd/1", "/dev/stdout").await?;
160 fs::symlink("/proc/self/fd/2", "/dev/stderr").await?;
161 self.mount_kernel_fs(
162 "cgroup2",
163 "/sys/fs/cgroup",
164 "",
165 Some(MountFlags::RELATIME),
166 None,
167 )
168 .await?;
169 Ok(())
170 }
171
172 async fn mount_addons(&mut self) -> Result<()> {
173 if !fs::try_exists(ADDONS_DEVICE_PATH).await? {
174 return Ok(());
175 }
176
177 self.mount_image(
178 &PathBuf::from(ADDONS_DEVICE_PATH),
179 &PathBuf::from(ADDONS_MOUNT_PATH),
180 LaunchPackedFormat::Squashfs,
181 )
182 .await?;
183 Ok(())
184 }
185
186 async fn mount_kernel_modules(&mut self) -> Result<()> {
187 if !fs::try_exists(ADDONS_MODULES_PATH).await? {
188 return Ok(());
189 }
190
191 let Some(platform_info) = PlatformInfo::new().ok() else {
192 return Ok(());
193 };
194
195 let kernel_release = platform_info.release().to_string_lossy().to_string();
196 let modules_path = format!("/newroot/lib/modules/{}", kernel_release);
197 fs::create_dir_all(&modules_path).await?;
198 Mount::builder()
199 .fstype(FilesystemType::Manual("none"))
200 .flags(MountFlags::BIND | MountFlags::RDONLY)
201 .mount(ADDONS_MODULES_PATH, modules_path)?;
202 Ok(())
203 }
204
205 async fn create_dir(&mut self, path: &str, mode: Option<u32>) -> Result<()> {
206 let path = Path::new(path);
207 if !path.is_dir() {
208 trace!("creating directory {:?}", path);
209 fs::create_dir(path).await?;
210 }
211 if let Some(mode) = mode {
212 let permissions = Permissions::from_mode(mode);
213 trace!("setting directory {:?} permissions to {:?}", path, mode);
214 fs::set_permissions(path, permissions).await?;
215 }
216 Ok(())
217 }
218
219 async fn mount_kernel_fs(
220 &mut self,
221 fstype: &str,
222 path: &str,
223 data: &str,
224 flags: Option<MountFlags>,
225 source: Option<&str>,
226 ) -> Result<()> {
227 trace!("mounting kernel fs {} to {}", fstype, path);
228 Mount::builder()
229 .fstype(FilesystemType::Manual(fstype))
230 .flags(MountFlags::NOEXEC | MountFlags::NOSUID | flags.unwrap_or(MountFlags::empty()))
231 .data(data)
232 .mount(source.unwrap_or(fstype), path)?;
233 Ok(())
234 }
235
236 fn map_console(&mut self, console: &File) -> Result<()> {
237 trace!("mapping console");
238 dup2(console.as_raw_fd(), 0)?;
239 dup2(console.as_raw_fd(), 1)?;
240 dup2(console.as_raw_fd(), 2)?;
241 Ok(())
242 }
243
244 async fn mount_config_image(&mut self) -> Result<()> {
245 trace!("mounting config image");
246 let config_mount_path = Path::new(CONFIG_MOUNT_PATH);
247 self.mount_image(
248 Path::new(CONFIG_BLOCK_DEVICE_PATH),
249 config_mount_path,
250 LaunchPackedFormat::Squashfs,
251 )
252 .await?;
253 Ok(())
254 }
255
256 async fn mount_root_image(&mut self, format: LaunchPackedFormat) -> Result<()> {
257 trace!("mounting root image");
258 let image_mount_path = Path::new(IMAGE_MOUNT_PATH);
259 self.mount_image(Path::new(IMAGE_BLOCK_DEVICE_PATH), image_mount_path, format)
260 .await?;
261 Ok(())
262 }
263
264 async fn mount_image(
265 &mut self,
266 from: &Path,
267 to: &Path,
268 format: LaunchPackedFormat,
269 ) -> Result<()> {
270 trace!("mounting {:?} image {:?} to {:?}", format, from, to);
271 if !to.is_dir() {
272 fs::create_dir(to).await?;
273 }
274 Mount::builder()
275 .fstype(FilesystemType::Manual(match format {
276 LaunchPackedFormat::Squashfs => "squashfs",
277 LaunchPackedFormat::Erofs => "erofs",
278 }))
279 .flags(MountFlags::RDONLY)
280 .mount(from, to)?;
281 Ok(())
282 }
283
284 async fn mount_move_subtree(&mut self, from: &Path, to: &Path) -> Result<()> {
285 trace!("moving subtree {:?} to {:?}", from, to);
286 if !to.is_dir() {
287 fs::create_dir(to).await?;
288 }
289 Mount::builder()
290 .fstype(FilesystemType::Manual("none"))
291 .flags(MountFlags::MOVE)
292 .mount(from, to)?;
293 Ok(())
294 }
295
296 async fn mount_new_root(&mut self) -> Result<()> {
297 trace!("mounting new root");
298 self.mount_overlay_tmpfs().await?;
299 self.bind_image_to_overlay_tmpfs().await?;
300 self.mount_overlay_to_new_root().await?;
301 std::env::set_current_dir(NEW_ROOT_PATH)?;
302 trace!("mounted new root");
303 Ok(())
304 }
305
306 async fn mount_overlay_tmpfs(&mut self) -> Result<()> {
307 fs::create_dir(OVERLAY_MOUNT_PATH).await?;
308 Mount::builder()
309 .fstype(FilesystemType::Manual("tmpfs"))
310 .mount("tmpfs", OVERLAY_MOUNT_PATH)?;
311 fs::create_dir(OVERLAY_UPPER_PATH).await?;
312 fs::create_dir(OVERLAY_WORK_PATH).await?;
313 Ok(())
314 }
315
316 async fn bind_image_to_overlay_tmpfs(&mut self) -> Result<()> {
317 fs::create_dir(OVERLAY_IMAGE_BIND_PATH).await?;
318 Mount::builder()
319 .fstype(FilesystemType::Manual("none"))
320 .flags(MountFlags::BIND | MountFlags::RDONLY)
321 .mount(IMAGE_MOUNT_PATH, OVERLAY_IMAGE_BIND_PATH)?;
322 Ok(())
323 }
324
325 async fn mount_overlay_to_new_root(&mut self) -> Result<()> {
326 fs::create_dir(NEW_ROOT_PATH).await?;
327 Mount::builder()
328 .fstype(FilesystemType::Manual("overlay"))
329 .flags(MountFlags::NOATIME)
330 .data(&format!(
331 "lowerdir={},upperdir={},workdir={}",
332 OVERLAY_IMAGE_BIND_PATH, OVERLAY_UPPER_PATH, OVERLAY_WORK_PATH
333 ))
334 .mount(format!("overlayfs:{}", OVERLAY_MOUNT_PATH), NEW_ROOT_PATH)?;
335 Ok(())
336 }
337
338 async fn parse_image_config(&mut self) -> Result<ImageConfiguration> {
339 let image_config_path = Path::new(IMAGE_CONFIG_JSON_PATH);
340 let content = fs::read_to_string(image_config_path).await?;
341 let config = serde_json::from_str(&content)?;
342 Ok(config)
343 }
344
345 async fn parse_launch_config(&mut self) -> Result<LaunchInfo> {
346 trace!("parsing launch config");
347 let launch_config = Path::new(LAUNCH_CONFIG_JSON_PATH);
348 let content = fs::read_to_string(launch_config).await?;
349 Ok(serde_json::from_str(&content)?)
350 }
351
352 async fn bind_new_root(&mut self) -> Result<()> {
353 self.mount_move_subtree(Path::new(SYS_PATH), Path::new(NEW_ROOT_SYS_PATH))
354 .await?;
355 self.mount_move_subtree(Path::new(PROC_PATH), Path::new(NEW_ROOT_PROC_PATH))
356 .await?;
357 self.mount_move_subtree(Path::new(DEV_PATH), Path::new(NEW_ROOT_DEV_PATH))
358 .await?;
359 trace!("binding new root");
360 Mount::builder()
361 .fstype(FilesystemType::Manual("none"))
362 .flags(MountFlags::BIND)
363 .mount(".", "/")?;
364 trace!("chrooting into new root");
365 chroot(".")?;
366 trace!("setting root as current directory");
367 std::env::set_current_dir("/")?;
368 Ok(())
369 }
370
371 async fn network_setup(&mut self, cfg: &LaunchInfo, network: &LaunchNetwork) -> Result<()> {
372 trace!("setting up network for link");
373
374 let etc = PathBuf::from_str("/etc")?;
375 if !etc.exists() {
376 fs::create_dir(etc).await?;
377 }
378 let resolv = PathBuf::from_str("/etc/resolv.conf")?;
379
380 {
381 let mut lines = vec!["# krata resolver configuration".to_string()];
382 for nameserver in &network.resolver.nameservers {
383 lines.push(format!("nameserver {}", nameserver));
384 }
385
386 let mut conf = lines.join("\n");
387 conf.push('\n');
388 fs::write(resolv, conf).await?;
389 }
390
391 let hosts = PathBuf::from_str("/etc/hosts")?;
392 if let Some(ref hostname) = cfg.hostname {
393 let mut lines = if hosts.exists() {
394 fs::read_to_string(&hosts)
395 .await?
396 .lines()
397 .map(|x| x.to_string())
398 .collect::<Vec<_>>()
399 } else {
400 vec!["127.0.0.1 localhost".to_string()]
401 };
402 lines.push(format!("127.0.1.1 {}", hostname));
403 fs::write(&hosts, lines.join("\n") + "\n").await?;
404 }
405
406 self.network_configure_ethtool(network).await?;
407 self.network_configure_link(network).await?;
408 Ok(())
409 }
410
411 async fn network_configure_link(&mut self, network: &LaunchNetwork) -> Result<()> {
412 let (connection, handle, _) = rtnetlink::new_connection()?;
413 tokio::spawn(connection);
414
415 let mut links = handle.link().get().match_name("lo".to_string()).execute();
416 let Some(link) = links.try_next().await? else {
417 warn!("unable to find link named lo");
418 return Ok(());
419 };
420
421 handle.link().set(link.header.index).up().execute().await?;
422
423 let ipv4_network: IpNetwork = network.ipv4.address.parse()?;
424 let ipv4_gateway: Ipv4Addr = network.ipv4.gateway.parse()?;
425 let ipv6_network: IpNetwork = network.ipv6.address.parse()?;
426 let ipv6_gateway: Ipv6Addr = network.ipv6.gateway.parse()?;
427
428 let mut links = handle
429 .link()
430 .get()
431 .match_name(network.link.clone())
432 .execute();
433 let Some(link) = links.try_next().await? else {
434 warn!("unable to find link named {}", network.link);
435 return Ok(());
436 };
437
438 handle
439 .address()
440 .add(link.header.index, ipv4_network.ip(), ipv4_network.prefix())
441 .execute()
442 .await?;
443
444 let ipv6_result = handle
445 .address()
446 .add(link.header.index, ipv6_network.ip(), ipv6_network.prefix())
447 .execute()
448 .await;
449
450 let ipv6_ready = match ipv6_result {
451 Ok(()) => true,
452 Err(error) => {
453 warn!("unable to setup ipv6 network: {}", error);
454 false
455 }
456 };
457
458 handle.link().set(link.header.index).up().execute().await?;
459
460 handle
461 .route()
462 .add()
463 .v4()
464 .destination_prefix(Ipv4Addr::UNSPECIFIED, 0)
465 .output_interface(link.header.index)
466 .gateway(ipv4_gateway)
467 .execute()
468 .await?;
469
470 if ipv6_ready {
471 let ipv6_gw_result = handle
472 .route()
473 .add()
474 .v6()
475 .destination_prefix(Ipv6Addr::UNSPECIFIED, 0)
476 .output_interface(link.header.index)
477 .gateway(ipv6_gateway)
478 .execute()
479 .await;
480
481 if let Err(error) = ipv6_gw_result {
482 warn!("failed to add ipv6 gateway route: {}", error);
483 }
484 }
485 Ok(())
486 }
487
488 async fn network_configure_ethtool(&mut self, network: &LaunchNetwork) -> Result<()> {
489 let mut handle = EthtoolHandle::new()?;
490 handle.set_gso(&network.link, false)?;
491 handle.set_tso(&network.link, false)?;
492 Ok(())
493 }
494
495 async fn run(
496 &mut self,
497 config: &Config,
498 launch: &LaunchInfo,
499 idm: IdmInternalClient,
500 ) -> Result<()> {
501 let mut cmd = match config.cmd() {
502 None => vec![],
503 Some(value) => value.clone(),
504 };
505
506 if launch.run.is_some() {
507 cmd.clone_from(launch.run.as_ref().unwrap());
508 }
509
510 if let Some(entrypoint) = config.entrypoint() {
511 for item in entrypoint.iter().rev() {
512 cmd.insert(0, item.to_string());
513 }
514 }
515
516 if cmd.is_empty() {
517 cmd.push("/bin/sh".to_string());
518 }
519
520 let path = cmd.remove(0);
521
522 let mut env = HashMap::new();
523 if let Some(config_env) = config.env() {
524 env.extend(GuestInit::env_map(config_env));
525 }
526 env.extend(launch.env.clone());
527 env.insert("KRATA_CONTAINER".to_string(), "1".to_string());
528
529 if !env.contains_key("TERM") {
532 env.insert("TERM".to_string(), "xterm".to_string());
533 }
534
535 let path = resolve_executable(&env, path.into())?;
536 let Some(file_name) = path.file_name() else {
537 return Err(anyhow!("cannot get file name of command path"));
538 };
539 let Some(file_name) = file_name.to_str() else {
540 return Err(anyhow!("cannot get file name of command path as str"));
541 };
542 cmd.insert(0, file_name.to_string());
543 let env = GuestInit::env_list(env);
544
545 trace!("running guest command: {}", cmd.join(" "));
546
547 let path = CString::new(path.as_os_str().as_bytes())?;
548 let cmd = GuestInit::strings_as_cstrings(cmd)?;
549 let env = GuestInit::strings_as_cstrings(env)?;
550 let mut working_dir = config
551 .working_dir()
552 .as_ref()
553 .map(|x| x.to_string())
554 .unwrap_or("/".to_string());
555
556 if working_dir.is_empty() {
557 working_dir = "/".to_string();
558 }
559
560 let cgroup = self.init_cgroup().await?;
561 self.fork_and_exec(idm, cgroup, working_dir, path, cmd, env)
562 .await?;
563 Ok(())
564 }
565
566 async fn init_cgroup(&self) -> Result<Cgroup> {
567 trace!("initializing cgroup");
568 let hierarchy = cgroups_rs::hierarchies::auto();
569 let cgroup = Cgroup::new(hierarchy, "krata-guest-task")?;
570 cgroup.set_cgroup_type("threaded")?;
571 trace!("initialized cgroup");
572 Ok(cgroup)
573 }
574
575 fn strings_as_cstrings(values: Vec<String>) -> Result<Vec<CString>> {
576 let mut results: Vec<CString> = vec![];
577 for value in values {
578 results.push(CString::new(value.as_bytes().to_vec())?);
579 }
580 Ok(results)
581 }
582
583 fn env_map(env: &[String]) -> HashMap<String, String> {
584 let mut map = HashMap::<String, String>::new();
585 for item in env {
586 if let Some((key, value)) = item.split_once('=') {
587 map.insert(key.to_string(), value.to_string());
588 }
589 }
590 map
591 }
592
593 fn env_list(env: HashMap<String, String>) -> Vec<String> {
594 env.iter()
595 .map(|(key, value)| format!("{}={}", key, value))
596 .collect::<Vec<String>>()
597 }
598
599 async fn fork_and_exec(
600 &mut self,
601 idm: IdmInternalClient,
602 cgroup: Cgroup,
603 working_dir: String,
604 path: CString,
605 cmd: Vec<CString>,
606 env: Vec<CString>,
607 ) -> Result<()> {
608 match unsafe { fork()? } {
609 ForkResult::Parent { child } => self.background(idm, cgroup, child).await,
610 ForkResult::Child => self.foreground(cgroup, working_dir, path, cmd, env).await,
611 }
612 }
613
614 async fn foreground(
615 &mut self,
616 cgroup: Cgroup,
617 working_dir: String,
618 path: CString,
619 cmd: Vec<CString>,
620 env: Vec<CString>,
621 ) -> Result<()> {
622 GuestInit::set_controlling_terminal()?;
623 std::env::set_current_dir(working_dir)?;
624 cgroup.add_task(CgroupPid::from(std::process::id() as u64))?;
625 execve(&path, &cmd, &env)?;
626 Ok(())
627 }
628
629 fn set_controlling_terminal() -> Result<()> {
630 unsafe {
631 setsid();
632 set_controlling_terminal(io::stdin().as_raw_fd(), 0)?;
633 }
634 Ok(())
635 }
636
637 async fn background(
638 &mut self,
639 idm: IdmInternalClient,
640 cgroup: Cgroup,
641 executed: Pid,
642 ) -> Result<()> {
643 let mut background = GuestBackground::new(idm, cgroup, executed).await?;
644 background.run().await?;
645 Ok(())
646 }
647}
648
649pub fn resolve_executable(env: &HashMap<String, String>, path: PathBuf) -> Result<PathBuf> {
650 if path.is_absolute() {
651 return Ok(path);
652 }
653
654 if path.is_file() {
655 return Ok(path.absolutize()?.to_path_buf());
656 }
657
658 if let Some(path_var) = env.get("PATH") {
659 for item in path_var.split(':') {
660 let mut exe_path: PathBuf = item.into();
661 exe_path.push(&path);
662 if exe_path.is_file() {
663 return Ok(exe_path);
664 }
665 }
666 }
667 Ok(path)
668}