/// Fixed guest-physical addresses used by the boot setup in this file.
///
/// Everything below [`KERNEL_LOAD`] is scratch space for boot structures; the
/// protected-mode kernel itself starts at 1 MiB.
pub mod layout {
    /// Top-level page table; its address is returned as CR3.
    pub const PML4: u64 = 0x1000;

    /// Second-level page table referenced by the first PML4 entry.
    pub const PDPT: u64 = 0x2000;

    /// Page directory referenced by the first PDPT entry (512 large-page entries).
    pub const PD: u64 = 0x3000;

    /// Base of the four-entry global descriptor table.
    pub const GDT: u64 = 0x4000;

    /// Base of the `boot_params` page handed to the kernel in RSI.
    pub const BOOT_PARAMS: u64 = 0x1_0000;

    /// Start of the NUL-terminated kernel command line.
    pub const CMDLINE: u64 = 0x2_0000;

    /// Address the protected-mode kernel is copied to (1 MiB).
    pub const KERNEL_LOAD: u64 = 0x10_0000;
}
// ---- Offsets inside the `boot_params` page -------------------------------

/// Offset of the one-byte count of valid E820 entries.
const BP_E820_ENTRIES: u64 = 0x1e8;

/// Offset at which the setup header copied from the image begins.
const BP_SETUP_HEADER: u64 = 0x1f1;

/// Offset of the first E820 entry; entries are written 20 bytes apart.
const BP_E820_TABLE: u64 = 0x2d0;

// ---- Setup-header field offsets (same offset in the image and in
// ---- `boot_params`, since the header is copied to its original offset) ----

/// `setup_sects` byte: number of 512-byte setup sectors after the boot sector.
const HDR_SETUP_SECTS: usize = 0x1f1;

/// `type_of_loader` byte.
const HDR_TYPE_OF_LOADER: u64 = 0x210;

/// `loadflags` byte.
const HDR_LOADFLAGS: u64 = 0x211;

/// 32-bit `ramdisk_image` field: load address of the initrd.
const HDR_RAMDISK_IMAGE: u64 = 0x218;

/// 32-bit `ramdisk_size` field: length of the initrd in bytes.
const HDR_RAMDISK_SIZE: u64 = 0x21c;

/// 32-bit `cmd_line_ptr` field: address of the command line.
const HDR_CMD_LINE_PTR: u64 = 0x228;

/// 32-bit `init_size` field, read to size the kernel's memory footprint.
const HDR_INIT_SIZE: u64 = 0x260;

/// Exclusive end of the header span that is copied; also the minimum image length.
const SETUP_HEADER_END: u64 = 0x268;

/// E820 entry type written for usable RAM.
const E820_RAM: u32 = 1;

// ---- Control-register and EFER values for the entry state -----------------

/// CR0 with paging (bit 31) and protected mode (bit 0) enabled.
const CR0_PE_PG: u64 = (1 << 31) | 1;

/// CR4 with physical-address extension (bit 5) enabled.
const CR4_PAE: u64 = 1 << 5;

/// EFER with long mode enabled (bit 8) and active (bit 10).
const EFER_LME_LMA: u64 = (1 << 8) | (1 << 10);

// ---- GDT contents and selectors --------------------------------------------

/// Flat, present, ring-0 code descriptor with the 64-bit (L) flag set.
const GDT_CODE64: u64 = 0x00af_9a00_0000_ffff;

/// Flat, present, ring-0 read/write data descriptor.
const GDT_DATA: u64 = 0x00cf_9200_0000_ffff;

/// Code-segment selector: GDT entry 2 (byte offset 16).
pub const BOOT_CS: u16 = 0x10;

/// Data-segment selector: GDT entry 3 (byte offset 24).
pub const BOOT_DS: u16 = 0x18;
/// Inputs to [`setup_boot`]: what to load and how much guest RAM exists.
pub struct BootConfig<'a> {
    /// Guest RAM size in bytes; `setup_boot` debug-asserts that it equals the
    /// length of the memory slice it is given.
    pub mem_size: usize,

    /// Kernel command line, without a trailing NUL (one is appended on write).
    pub cmdline: &'a str,

    /// Raw bzImage bytes: setup sectors followed by the protected-mode kernel.
    pub bzimage: &'a [u8],

    /// Optional initial ramdisk image to place in guest RAM.
    pub initrd: Option<&'a [u8]>,
}
/// One segment register's selector plus its unpacked descriptor attributes.
///
/// Attribute fields hold the raw bit values (`0` or `1` for single-bit flags).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Segment {
    /// Segment selector value.
    pub selector: u16,

    /// Segment base address.
    pub base: u64,

    /// Segment limit.
    pub limit: u32,

    /// Four-bit descriptor type field.
    pub type_: u8,

    /// Descriptor privilege level.
    pub dpl: u8,

    /// Descriptor-type flag (`setup_boot` sets 1 for its code and data segments).
    pub s: u8,

    /// Segment-present flag.
    pub present: u8,

    /// 64-bit code-segment flag.
    pub l: u8,

    /// Default operation size flag.
    pub db: u8,

    /// Granularity flag.
    pub g: u8,
}
/// Register state produced by [`setup_boot`] for entering the kernel.
///
/// NOTE(review): presumably the caller loads these into the boot vCPU; the
/// consumer is not part of this file.
#[derive(Clone, Copy, Debug)]
pub struct BootRegs {
    /// Value for CR0.
    pub cr0: u64,

    /// Value for CR3: the guest-physical address of the top-level page table.
    pub cr3: u64,

    /// Value for CR4.
    pub cr4: u64,

    /// Value for the EFER MSR.
    pub efer: u64,

    /// Guest-physical base address of the GDT.
    pub gdt_base: u64,

    /// GDT limit (table size in bytes minus one).
    pub gdt_limit: u16,

    /// Code-segment state.
    pub cs: Segment,

    /// Data-segment state.
    pub ds: Segment,

    /// Initial instruction pointer.
    pub rip: u64,

    /// Initial RSI: the guest-physical address of `boot_params`.
    pub rsi: u64,

    /// Initial RFLAGS.
    pub rflags: u64,
}
/// Reasons boot setup can fail.
#[derive(Debug, PartialEq, Eq)]
pub enum BootError {
    /// The image is `len` bytes long but at least `need` bytes are required.
    BzImageTooSmall { len: usize, need: usize },

    /// The protected-mode kernel would start at `pm_offset`, which is at or
    /// beyond the end of a `len`-byte image.
    SetupSectorsOutOfRange { pm_offset: usize, len: usize },

    /// `need` bytes of guest RAM are required but only `have` are available.
    MemTooSmall { need: u64, have: usize },

    /// Two boot regions overlap; the payload names them.
    RegionOverlap(&'static str),

    /// The command line is `len` bytes, which does not fit in `max` bytes.
    CmdlineTooLong { len: usize, max: u64 },
}

impl std::fmt::Display for BootError {
    /// Renders a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every payload is `Copy`, so bind by value and hand the pre-built
        // arguments straight to the formatter.
        match *self {
            Self::BzImageTooSmall { len, need } => f.write_fmt(format_args!(
                "bzImage too small: {len} bytes, need >= {need}"
            )),
            Self::SetupSectorsOutOfRange { pm_offset, len } => f.write_fmt(format_args!(
                "setup sectors out of range: pm-kernel at +0x{pm_offset:x} past end ({len})"
            )),
            Self::MemTooSmall { need, have } => f.write_fmt(format_args!(
                "guest RAM too small: need 0x{need:x} bytes, have 0x{have:x}"
            )),
            Self::RegionOverlap(what) => {
                f.write_fmt(format_args!("boot region overlap: {what}"))
            }
            Self::CmdlineTooLong { len, max } => {
                f.write_fmt(format_args!("cmdline too long: {len} bytes, max {max}"))
            }
        }
    }
}

impl std::error::Error for BootError {}
struct GuestWriter<'a> {
mem: &'a mut [u8],
}
impl GuestWriter<'_> {
fn write(&mut self, gpa: u64, bytes: &[u8]) -> Result<(), BootError> {
let start = gpa as usize;
let end = start
.checked_add(bytes.len())
.filter(|&e| e <= self.mem.len())
.ok_or(BootError::MemTooSmall {
need: gpa + bytes.len() as u64,
have: self.mem.len(),
})?;
self.mem[start..end].copy_from_slice(bytes);
Ok(())
}
fn write_u8(&mut self, gpa: u64, v: u8) -> Result<(), BootError> {
self.write(gpa, &[v])
}
fn write_u32(&mut self, gpa: u64, v: u32) -> Result<(), BootError> {
self.write(gpa, &v.to_le_bytes())
}
fn write_u64(&mut self, gpa: u64, v: u64) -> Result<(), BootError> {
self.write(gpa, &v.to_le_bytes())
}
}
/// Picks the guest-physical address at which to load an initrd.
///
/// The initrd is placed as high as it will go: at the last page boundary at or
/// below `mem_size - initrd_len`. That address must not be lower than the first
/// page boundary at or after `layout::KERNEL_LOAD + kernel_footprint`.
///
/// # Errors
/// Returns [`BootError::MemTooSmall`] when `initrd_len` exceeds `mem_size` or
/// the chosen address would fall below that boundary. `need` is the boundary
/// plus `initrd_len`; `have` is `mem_size`.
fn initrd_load_addr(
    mem_size: u64,
    kernel_footprint: u64,
    initrd_len: u64,
) -> Result<u64, BootError> {
    const PAGE: u64 = 0x1000;

    // Lowest page-aligned address that is clear of the kernel.
    let lowest_allowed = (layout::KERNEL_LOAD + kernel_footprint).next_multiple_of(PAGE);

    // Highest page-aligned start that still leaves room for the whole image,
    // discarded if it would land inside the kernel footprint.
    let candidate = mem_size
        .checked_sub(initrd_len)
        .map(|top| top - top % PAGE)
        .filter(|&addr| addr >= lowest_allowed);

    if let Some(addr) = candidate {
        return Ok(addr);
    }
    Err(BootError::MemTooSmall {
        need: lowest_allowed + initrd_len,
        have: mem_size as usize,
    })
}
pub fn setup_boot(mem: &mut [u8], cfg: &BootConfig) -> Result<BootRegs, BootError> {
use layout::*;
debug_assert_eq!(
mem.len(),
cfg.mem_size,
"mem slice must be exactly mem_size"
);
let bz = cfg.bzimage;
if (bz.len() as u64) < SETUP_HEADER_END {
return Err(BootError::BzImageTooSmall {
len: bz.len(),
need: SETUP_HEADER_END as usize,
});
}
let setup_sects = if bz[HDR_SETUP_SECTS] == 0 {
4u8
} else {
bz[HDR_SETUP_SECTS]
};
let pm_offset = (setup_sects as usize + 1) * 512;
if pm_offset >= bz.len() {
return Err(BootError::SetupSectorsOutOfRange {
pm_offset,
len: bz.len(),
});
}
let pm_kernel = &bz[pm_offset..];
let kernel_end = KERNEL_LOAD + pm_kernel.len() as u64;
let init_size = u32::from_le_bytes(
bz[HDR_INIT_SIZE as usize..HDR_INIT_SIZE as usize + 4]
.try_into()
.unwrap(),
) as u64;
let kernel_footprint = (pm_kernel.len() as u64).max(init_size);
let mut high_water = kernel_end;
let initrd_addr = match cfg.initrd {
Some(rd) => {
let addr = initrd_load_addr(cfg.mem_size as u64, kernel_footprint, rd.len() as u64)?;
high_water = high_water.max(addr + rd.len() as u64);
Some(addr)
}
None => None,
};
if high_water > cfg.mem_size as u64 {
return Err(BootError::MemTooSmall {
need: high_water,
have: cfg.mem_size,
});
}
let cmdline_max = KERNEL_LOAD - CMDLINE;
if cfg.cmdline.len() as u64 + 1 > cmdline_max {
return Err(BootError::CmdlineTooLong {
len: cfg.cmdline.len(),
max: cmdline_max,
});
}
let mut w = GuestWriter { mem };
w.write_u64(PML4, PDPT | 0x3)?; w.write_u64(PDPT, PD | 0x3)?;
for i in 0..512u64 {
w.write_u64(PD + i * 8, (i * 0x20_0000) | 0x83)?; }
w.write_u64(GDT, 0)?;
w.write_u64(GDT + 8, 0)?;
w.write_u64(GDT + 16, GDT_CODE64)?;
w.write_u64(GDT + 24, GDT_DATA)?;
w.write(
BOOT_PARAMS + BP_SETUP_HEADER,
&bz[BP_SETUP_HEADER as usize..SETUP_HEADER_END as usize],
)?;
w.write_u8(BOOT_PARAMS + HDR_TYPE_OF_LOADER, 0xff)?; let loadflags = bz[HDR_LOADFLAGS as usize] | 0x01;
w.write_u8(BOOT_PARAMS + HDR_LOADFLAGS, loadflags)?;
match (cfg.initrd, initrd_addr) {
(Some(rd), Some(addr)) => {
w.write_u32(BOOT_PARAMS + HDR_RAMDISK_IMAGE, addr as u32)?;
w.write_u32(BOOT_PARAMS + HDR_RAMDISK_SIZE, rd.len() as u32)?;
}
_ => {
w.write_u32(BOOT_PARAMS + HDR_RAMDISK_IMAGE, 0)?;
w.write_u32(BOOT_PARAMS + HDR_RAMDISK_SIZE, 0)?;
}
}
w.write_u32(BOOT_PARAMS + HDR_CMD_LINE_PTR, CMDLINE as u32)?;
w.write(CMDLINE, cfg.cmdline.as_bytes())?;
w.write_u8(CMDLINE + cfg.cmdline.len() as u64, 0)?;
let mut e820 = |slot: u64, addr: u64, size: u64, typ: u32| -> Result<(), BootError> {
let base = BOOT_PARAMS + BP_E820_TABLE + slot * 20;
w.write_u64(base, addr)?;
w.write_u64(base + 8, size)?;
w.write_u32(base + 16, typ)?;
Ok(())
};
e820(0, 0x0, 0x9_fc00, E820_RAM)?;
e820(1, KERNEL_LOAD, cfg.mem_size as u64 - KERNEL_LOAD, E820_RAM)?;
w.write_u8(BOOT_PARAMS + BP_E820_ENTRIES, 2)?;
w.write(KERNEL_LOAD, pm_kernel)?;
if let (Some(rd), Some(addr)) = (cfg.initrd, initrd_addr) {
w.write(addr, rd)?;
}
let cs = Segment {
selector: BOOT_CS,
base: 0,
limit: 0xffff_ffff,
type_: 0b1011, dpl: 0,
s: 1,
present: 1,
l: 1,
db: 0,
g: 1,
};
let ds = Segment {
selector: BOOT_DS,
base: 0,
limit: 0xffff_ffff,
type_: 0b0011, dpl: 0,
s: 1,
present: 1,
l: 0,
db: 1,
g: 1,
};
Ok(BootRegs {
cr0: CR0_PE_PG,
cr3: PML4,
cr4: CR4_PAE,
efer: EFER_LME_LMA,
gdt_base: GDT,
gdt_limit: 4 * 8 - 1,
cs,
ds,
rip: KERNEL_LOAD + 0x200,
rsi: BOOT_PARAMS,
rflags: 0x2, })
}
// Unit tests and one property test for `setup_boot` and `initrd_load_addr`.
// The property test at the bottom requires the `proptest` crate.
#[cfg(test)]
mod tests {
use super::layout::*;
use super::*;
// Builds a synthetic bzImage: `setup_sects + 1` zeroed 512-byte sectors with
// `setup_sects` and `loadflags` (0x80) filled in, followed by `pm_len` bytes
// of a position-dependent pattern standing in for the protected-mode kernel.
fn fake_bzimage(setup_sects: u8, pm_len: usize) -> Vec<u8> {
let pm_offset = (setup_sects as usize + 1) * 512;
let mut bz = vec![0u8; pm_offset + pm_len];
bz[HDR_SETUP_SECTS] = setup_sects;
bz[HDR_LOADFLAGS as usize] = 0x80;
for (i, b) in bz[pm_offset..].iter_mut().enumerate() {
*b = (i as u8) ^ 0xa5;
}
bz
}
// Reads a little-endian u32 from guest memory at `gpa`.
fn rd_u32(mem: &[u8], gpa: u64) -> u32 {
let s = gpa as usize;
u32::from_le_bytes(mem[s..s + 4].try_into().unwrap())
}
// Reads a little-endian u64 from guest memory at `gpa`.
fn rd_u64(mem: &[u8], gpa: u64) -> u64 {
let s = gpa as usize;
u64::from_le_bytes(mem[s..s + 8].try_into().unwrap())
}
// The returned register state describes long mode with the expected
// page-table root, entry point, boot_params pointer, GDT and selectors.
#[test]
fn builds_long_mode_entry_state() {
let bz = fake_bzimage(4, 4096);
let mem_size = 512 * 1024 * 1024;
let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "console=ttyS0 root=/dev/vda",
bzimage: &bz,
initrd: None,
};
let regs = setup_boot(&mut mem, &cfg).expect("setup_boot");
assert_eq!(regs.cr0, 0x8000_0001);
assert_eq!(regs.cr4, 0x20);
assert_eq!(regs.efer, 0x500);
assert_eq!(regs.cr3, PML4);
assert_eq!(regs.rip, KERNEL_LOAD + 0x200);
assert_eq!(regs.rsi, BOOT_PARAMS);
assert_eq!(regs.gdt_base, GDT);
assert_eq!(regs.gdt_limit, 31);
assert_eq!(regs.cs.selector, BOOT_CS);
assert_eq!(regs.cs.l, 1, "CS must be a 64-bit code segment");
assert_eq!(regs.ds.selector, BOOT_DS);
assert_eq!(regs.ds.db, 1);
}
// Checks the raw bytes written to guest memory: page-table entries, GDT
// descriptors, patched header fields, both E820 entries, zeroed ramdisk
// fields (no initrd) and the NUL-terminated command line.
#[test]
fn writes_page_tables_gdt_and_zero_page() {
let bz = fake_bzimage(4, 8192);
let mem_size = 256 * 1024 * 1024;
let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "console=ttyS0",
bzimage: &bz,
initrd: None,
};
setup_boot(&mut mem, &cfg).unwrap();
assert_eq!(rd_u64(&mem, PML4), PDPT | 0x3);
assert_eq!(rd_u64(&mem, PDPT), PD | 0x3);
assert_eq!(rd_u64(&mem, PD), 0x83);
assert_eq!(rd_u64(&mem, PD + 511 * 8), (511 * 0x20_0000) | 0x83);
assert_eq!(rd_u64(&mem, GDT + 16), 0x00af_9a00_0000_ffff);
assert_eq!(rd_u64(&mem, GDT + 24), 0x00cf_9200_0000_ffff);
// type_of_loader, loadflags (0x80 from the image | 0x01), cmd_line_ptr, E820 count.
assert_eq!(mem[(BOOT_PARAMS + 0x210) as usize], 0xff); assert_eq!(mem[(BOOT_PARAMS + 0x211) as usize], 0x81); assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x228), CMDLINE as u32); assert_eq!(mem[(BOOT_PARAMS + 0x1e8) as usize], 2);
// E820 entry 0: addr, size, type.
assert_eq!(rd_u64(&mem, BOOT_PARAMS + 0x2d0), 0);
assert_eq!(rd_u64(&mem, BOOT_PARAMS + 0x2d0 + 8), 0x9_fc00);
assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x2d0 + 16), 1);
// E820 entry 1 starts 20 bytes later: addr, size.
assert_eq!(rd_u64(&mem, BOOT_PARAMS + 0x2d0 + 20), KERNEL_LOAD);
assert_eq!(
rd_u64(&mem, BOOT_PARAMS + 0x2d0 + 28),
mem_size as u64 - KERNEL_LOAD
);
assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x218), 0);
assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x21c), 0);
let c = b"console=ttyS0";
assert_eq!(&mem[CMDLINE as usize..CMDLINE as usize + c.len()], c);
assert_eq!(mem[CMDLINE as usize + c.len()], 0);
}
// The bytes after the setup sectors land verbatim at KERNEL_LOAD.
#[test]
fn copies_pm_kernel_to_one_mib() {
let bz = fake_bzimage(2, 1024);
let pm_offset = (2 + 1) * 512;
let mem_size = 64 * 1024 * 1024;
let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "x",
bzimage: &bz,
initrd: None,
};
setup_boot(&mut mem, &cfg).unwrap();
assert_eq!(
&mem[KERNEL_LOAD as usize..KERNEL_LOAD as usize + 1024],
&bz[pm_offset..pm_offset + 1024]
);
}
// With an initrd, the ramdisk fields point at a page-aligned copy of it
// that lies above KERNEL_LOAD and inside RAM.
#[test]
fn places_initramfs_and_sets_ramdisk_fields() {
let bz = fake_bzimage(4, 4096);
let initrd = vec![0x5au8; 4096];
let mem_size = 128 * 1024 * 1024;
let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "console=ttyS0",
bzimage: &bz,
initrd: Some(&initrd),
};
setup_boot(&mut mem, &cfg).unwrap();
let addr = rd_u32(&mem, BOOT_PARAMS + 0x218) as usize;
assert_eq!(rd_u32(&mem, BOOT_PARAMS + 0x21c), initrd.len() as u32);
assert_eq!(&mem[addr..addr + initrd.len()], &initrd[..]);
assert!(addr as u64 > KERNEL_LOAD);
assert_eq!(addr & 0xfff, 0);
assert!(addr + initrd.len() <= mem_size);
}
// When the header's init_size exceeds the copied image, the initrd must
// still be placed above KERNEL_LOAD + init_size.
#[test]
fn initrd_placed_clear_of_kernel_footprint() {
let mut bz = fake_bzimage(4, 4096);
let init_size: u32 = 200 * 1024 * 1024; bz[HDR_INIT_SIZE as usize..HDR_INIT_SIZE as usize + 4]
.copy_from_slice(&init_size.to_le_bytes());
let initrd = vec![0x5au8; 4096];
let mem_size = 512 * 1024 * 1024;
let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "console=ttyS0",
bzimage: &bz,
initrd: Some(&initrd),
};
setup_boot(&mut mem, &cfg).unwrap();
let addr = rd_u32(&mem, BOOT_PARAMS + 0x218) as u64;
assert!(
addr >= KERNEL_LOAD + init_size as u64,
"initrd at {addr:#x} must clear the {init_size:#x}-byte kernel footprint"
);
}
// An image shorter than the setup header (0x100 < 0x268 bytes) is rejected.
#[test]
fn rejects_truncated_bzimage() {
let bz = vec![0u8; 0x100]; let mut mem = vec![0u8; 64 * 1024 * 1024];
let cfg = BootConfig {
mem_size: mem.len(),
cmdline: "x",
bzimage: &bz,
initrd: None,
};
assert!(matches!(
setup_boot(&mut mem, &cfg),
Err(BootError::BzImageTooSmall { .. })
));
}
// setup_sects = 200 puts the kernel at 201 * 512 bytes, far past a
// 0x268-byte image.
#[test]
fn rejects_setup_sects_past_end() {
let mut bz = vec![0u8; 0x268];
bz[HDR_SETUP_SECTS] = 200; let mut mem = vec![0u8; 64 * 1024 * 1024];
let cfg = BootConfig {
mem_size: mem.len(),
cmdline: "x",
bzimage: &bz,
initrd: None,
};
assert!(matches!(
setup_boot(&mut mem, &cfg),
Err(BootError::SetupSectorsOutOfRange { .. })
));
}
// An 8 MiB kernel cannot be loaded at 1 MiB in 4 MiB of RAM.
#[test]
fn rejects_ram_too_small_for_kernel() {
let bz = fake_bzimage(4, 8 * 1024 * 1024);
let mem_size = 4 * 1024 * 1024; let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "x",
bzimage: &bz,
initrd: None,
};
assert!(matches!(
setup_boot(&mut mem, &cfg),
Err(BootError::MemTooSmall { .. })
));
}
// An initrd as large as all of RAM leaves no room above the kernel.
#[test]
fn rejects_initramfs_past_end_of_ram() {
let bz = fake_bzimage(4, 4096);
let mem_size = 8 * 1024 * 1024;
let initrd = vec![0u8; mem_size];
let mut mem = vec![0u8; mem_size];
let cfg = BootConfig {
mem_size,
cmdline: "x",
bzimage: &bz,
initrd: Some(&initrd),
};
assert!(matches!(
setup_boot(&mut mem, &cfg),
Err(BootError::MemTooSmall { .. })
));
}
// A command line that exactly fills the CMDLINE..KERNEL_LOAD gap leaves no
// room for the terminating NUL and is rejected.
#[test]
fn rejects_overlong_cmdline() {
let bz = fake_bzimage(4, 4096);
let mem_size = 64 * 1024 * 1024;
let mut mem = vec![0u8; mem_size];
let huge = "a".repeat((KERNEL_LOAD - CMDLINE) as usize);
let cfg = BootConfig {
mem_size,
cmdline: &huge,
bzimage: &bz,
initrd: None,
};
assert!(matches!(
setup_boot(&mut mem, &cfg),
Err(BootError::CmdlineTooLong { .. })
));
}
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig::with_cases(4096))]
// For arbitrary sizes: a successful placement is page-aligned, at or above
// the page-rounded end of the kernel footprint, and ends inside RAM; a
// failure only happens when footprint floor + initrd exceeds RAM.
#[test]
fn initrd_never_overlaps_kernel_footprint(
mem_size in 0u64..=64 * 1024 * 1024 * 1024,
footprint in 0u64..=8 * 1024 * 1024 * 1024,
initrd_len in 0u64..=8 * 1024 * 1024 * 1024,
) {
let floor = (KERNEL_LOAD + footprint).next_multiple_of(0x1000);
match initrd_load_addr(mem_size, footprint, initrd_len) {
Ok(addr) => {
prop_assert_eq!(addr & 0xfff, 0, "page-aligned");
prop_assert!(addr >= floor, "clears the kernel footprint");
prop_assert!(addr + initrd_len <= mem_size, "fits in RAM");
}
Err(_) => {
prop_assert!(floor + initrd_len > mem_size);
}
}
}
}
}