use alloc::{
string::{String, ToString},
vec::Vec,
};
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use core::ptr::NonNull;
use super::vm_fdt::{FdtWriter, FdtWriterNode};
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use ax_memory_addr::MemoryAddr;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64", test))]
use axaddrspace::GuestPhysAddr;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64", test))]
use axvm::VMMemoryRegion;
use axvm::config::AxVMCrateConfig;
use fdt_parser::{Fdt, Node};
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use crate::vmm::{VMRef, images::load_vm_image_from_memory};
/// Returns `true` for RISC-V cache-block-size property names that must not
/// be copied from host CPU nodes into the guest FDT.
fn should_skip_guest_cpu_prop(prop_name: &str) -> bool {
    const SKIPPED_PROPS: [&str; 3] = [
        "riscv,cbop-block-size",
        "riscv,cboz-block-size",
        "riscv,cbom-block-size",
    ];
    SKIPPED_PROPS.contains(&prop_name)
}
/// Builds a guest FDT from the host FDT, keeping the root node, the `/cpus`
/// subtree filtered down to the vCPUs listed in `crate_config.base.phys_cpu_ids`,
/// and the passthrough devices named in `passthrough_device_names` together
/// with their ancestor and descendant nodes. All `memory*` nodes are dropped.
///
/// NOTE(review): `crate_` in the name looks like a typo for `create_`;
/// renaming would break callers, so it is left as-is.
pub fn crate_guest_fdt(
    fdt: &Fdt,
    passthrough_device_names: &[String],
    crate_config: &AxVMCrateConfig,
) -> Vec<u8> {
    let mut fdt_writer = FdtWriter::new().unwrap();
    // Depth of the most recently *emitted* node; drives how many open writer
    // nodes must be closed before starting a node at a shallower depth.
    let mut previous_node_level = 0;
    // Writer handles for every currently-open node, root at the bottom.
    let mut node_stack: Vec<FdtWriterNode> = Vec::new();
    let phys_cpu_ids = crate_config
        .base
        .phys_cpu_ids
        .clone()
        .expect("ERROR: phys_cpu_ids is None");
    let all_nodes: Vec<Node> = fdt.all_nodes().collect();
    let all_paths = super::build_all_node_paths(&all_nodes);
    for (index, node) in all_nodes.iter().enumerate() {
        let node_path = &all_paths[index];
        let node_action = determine_node_action(node, node_path, passthrough_device_names);
        match node_action {
            NodeAction::RootNode => {
                node_stack.push(fdt_writer.begin_node("").unwrap());
            }
            NodeAction::CpuNode => {
                // Keep only CPU nodes whose id is assigned to this VM.
                let need = need_cpu_node(&phys_cpu_ids, node, node_path);
                if need {
                    handle_node_level_change(
                        &mut fdt_writer,
                        &mut node_stack,
                        node.level,
                        previous_node_level,
                    );
                    node_stack.push(fdt_writer.begin_node(node.name()).unwrap());
                } else {
                    // Skipped nodes emit no properties and do not update
                    // `previous_node_level`.
                    continue;
                }
            }
            NodeAction::Skip => {
                continue;
            }
            // Passthrough devices and their ancestors/descendants are
            // copied into the guest FDT verbatim.
            _ => {
                trace!(
                    "Found exact passthrough device node: {}, path: {}",
                    node.name(),
                    node_path
                );
                handle_node_level_change(
                    &mut fdt_writer,
                    &mut node_stack,
                    node.level,
                    previous_node_level,
                );
                node_stack.push(fdt_writer.begin_node(node.name()).unwrap());
            }
        }
        previous_node_level = node.level;
        for prop in node.propertys() {
            // Host cache-block-size properties are withheld from guest CPUs.
            if node_path.starts_with("/cpus") && should_skip_guest_cpu_prop(prop.name) {
                continue;
            }
            fdt_writer.property(prop.name, prop.raw_value()).unwrap();
        }
    }
    // Close every still-open node; the root is popped last, so the level
    // counter must land exactly on zero.
    while let Some(node) = node_stack.pop() {
        previous_node_level -= 1;
        fdt_writer.end_node(node).unwrap();
    }
    assert_eq!(previous_node_level, 0);
    fdt_writer.finish().unwrap()
}
/// How a host-FDT node is treated when building the guest FDT.
enum NodeAction {
    /// Drop the node and its properties.
    Skip,
    /// The root node `/`; opened in the writer with an empty name.
    RootNode,
    /// A node under `/cpus`; kept only if `need_cpu_node` accepts it.
    CpuNode,
    /// Exact match against the passthrough device list.
    IncludeAsPassthroughDevice,
    /// Descendant of a passthrough device node.
    IncludeAsChildNode,
    /// Ancestor of a passthrough device node (kept so the path stays intact).
    IncludeAsAncestorNode,
}
/// Classifies `node` for guest-FDT construction.
///
/// Precedence: root > `memory*` (skipped) > `/cpus` subtree > exact
/// passthrough match > descendant of a passthrough device > ancestor of
/// one > skip.
fn determine_node_action(
    node: &Node,
    node_path: &str,
    passthrough_device_names: &[String],
) -> NodeAction {
    if node.name() == "/" {
        NodeAction::RootNode
    } else if node.name().starts_with("memory") {
        NodeAction::Skip
    } else if node_path.starts_with("/cpus") {
        NodeAction::CpuNode
    // Compare without allocating a temporary `String` per node
    // (the original used `contains(&node_path.to_string())`).
    } else if passthrough_device_names.iter().any(|name| name == node_path) {
        NodeAction::IncludeAsPassthroughDevice
    } else if is_descendant_of_passthrough_device(node_path, node.level, passthrough_device_names) {
        NodeAction::IncludeAsChildNode
    } else if is_ancestor_of_passthrough_device(node_path, passthrough_device_names) {
        NodeAction::IncludeAsAncestorNode
    } else {
        NodeAction::Skip
    }
}
/// Returns `true` when `node_path` at depth `node_level` lies strictly below
/// one of the passthrough device paths.
///
/// A node qualifies when its path extends a passthrough path at a `/`
/// boundary and its depth exceeds the device's depth (the root path `/` is
/// special-cased: any node at level >= 2 under it qualifies).
///
/// Byte-level `strip_prefix`/`starts_with` replaces the original
/// `chars().nth(byte_len)` check, which mixed a char index with a byte
/// length and was only correct for ASCII paths.
fn is_descendant_of_passthrough_device(
    node_path: &str,
    node_level: usize,
    passthrough_device_names: &[String],
) -> bool {
    passthrough_device_names.iter().any(|passthrough_path| {
        // An empty remainder means the node *is* the device, not a descendant.
        let Some(rest) = node_path.strip_prefix(passthrough_path.as_str()) else {
            return false;
        };
        if rest.is_empty() {
            return false;
        }
        // A real descendant continues at a path separator ("/" already ends
        // with one, so any remainder qualifies there).
        if passthrough_path != "/" && !rest.starts_with('/') {
            return false;
        }
        if passthrough_path == "/" {
            node_level >= 2
        } else {
            // The device's depth equals the number of '/' in its path.
            node_level > passthrough_path.matches('/').count()
        }
    })
}
/// Closes one open writer node for every level stepped back up when moving
/// from a node at `previous_level` to one at `current_level`.
fn handle_node_level_change(
    fdt_writer: &mut FdtWriter,
    node_stack: &mut Vec<FdtWriterNode>,
    current_level: usize,
    previous_level: usize,
) {
    // The inclusive range is empty whenever we are descending
    // (current_level > previous_level), so no explicit guard is needed.
    for _ in current_level..=previous_level {
        let Some(open_node) = node_stack.pop() else {
            break;
        };
        fdt_writer.end_node(open_node).unwrap();
    }
}
/// Returns `true` when `node_path` is a strict ancestor of one of the
/// passthrough device paths (its path is a proper prefix ending at a `/`
/// boundary, or it is the root `/`).
///
/// Byte-level `strip_prefix`/`starts_with` replaces the original
/// `chars().nth(byte_len)` check, which mixed a char index with a byte
/// length and was only correct for ASCII paths.
fn is_ancestor_of_passthrough_device(node_path: &str, passthrough_device_names: &[String]) -> bool {
    passthrough_device_names.iter().any(|passthrough_path| {
        match passthrough_path.strip_prefix(node_path) {
            // Strict prefix: the remainder must be non-empty, and a genuine
            // ancestor continues at a separator (the root always qualifies).
            Some(rest) if !rest.is_empty() => rest.starts_with('/') || node_path == "/",
            _ => false,
        }
    })
}
/// Decides whether a `/cpus` subtree node should be kept in the guest FDT.
///
/// Nodes that are not `cpu@...` entries (e.g. `/cpus` itself or `cpu-map`)
/// are always kept. A `cpu@<unit-address>` node is kept when either its hex
/// unit address or the address of its first `reg` entry appears in
/// `phys_cpu_ids`.
fn need_cpu_node(phys_cpu_ids: &[usize], node: &Node, node_path: &str) -> bool {
    if !node_path.starts_with("/cpus/cpu@") {
        return true;
    }
    // First attempt: parse the unit address embedded in the path as hex.
    if let Some(cpu_id) = node_path
        .strip_prefix("/cpus/cpu@")
        .and_then(|rest| rest.split('/').next())
        .and_then(|id| usize::from_str_radix(id, 16).ok())
        && phys_cpu_ids.contains(&cpu_id)
    {
        return true;
    }
    // Fallback: match against the address of the first `reg` entry.
    if let Some(mut cpu_reg) = node.reg()
        && let Some(reg_entry) = cpu_reg.next()
    {
        let cpu_address = reg_entry.address as usize;
        debug!(
            "Checking CPU node {} with address 0x{:x}",
            node.name(),
            cpu_address
        );
        if phys_cpu_ids.contains(&cpu_address) {
            debug!(
                "CPU node {} with address 0x{:x} is in phys_cpu_ids, including in guest FDT",
                node.name(),
                cpu_address
            );
            return true;
        }
        debug!(
            "CPU node {} with address 0x{:x} is NOT in phys_cpu_ids, skipping",
            node.name(),
            cpu_address
        );
    }
    false
}
/// Writes the `reg` and `device_type` properties of a guest `/memory` node.
///
/// The caller is responsible for wrapping this in `begin_node("memory")` /
/// `end_node`. One `(base, size)` pair is emitted per accepted region, each
/// value split into two u32 cells (assumes `#address-cells` =
/// `#size-cells` = 2 in the guest FDT — TODO confirm against the root node).
///
/// NOTE(review): the warning compares the *total* region counts while the
/// loop truncates both sides to `configured_region_count`; the log message
/// suggests this filtering is intentional — confirm.
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64", test))]
fn add_memory_node(
    new_memory: &[VMMemoryRegion],
    crate_config: &AxVMCrateConfig,
    new_fdt: &mut FdtWriter,
) {
    // A configured count of 0 means "use every region from the config".
    let configured_region_count = if crate_config.kernel.configured_memory_region_count == 0 {
        crate_config.kernel.memory_regions.len()
    } else {
        crate_config
            .kernel
            .configured_memory_region_count
            .min(crate_config.kernel.memory_regions.len())
    };
    if new_memory.len() != crate_config.kernel.memory_regions.len() {
        warn!(
            "VM memory region count {} does not match config region count {}; filtering /memory by zipped order",
            new_memory.len(),
            crate_config.kernel.memory_regions.len()
        );
    }
    // Build the `reg` cell list: (hi, lo) of the base followed by (hi, lo)
    // of the size, for each region.
    let mut new_value: Vec<u32> = Vec::new();
    for (mem, _cfg) in new_memory.iter().take(configured_region_count).zip(
        crate_config
            .kernel
            .memory_regions
            .iter()
            .take(configured_region_count),
    ) {
        let gpa = mem.gpa.as_usize() as u64;
        let size = mem.size() as u64;
        new_value.push((gpa >> 32) as u32);
        new_value.push((gpa & 0xFFFFFFFF) as u32);
        new_value.push((size >> 32) as u32);
        new_value.push((size & 0xFFFFFFFF) as u32);
    }
    info!("Adding memory node with value: {new_value:x?}");
    new_fdt
        .property_array_u32("reg", new_value.as_ref())
        .unwrap();
    new_fdt.property_string("device_type", "memory").unwrap();
}
/// Computes the guest-physical `[start, end)` range of the configured initrd,
/// or `None` when no ramdisk is configured or its size is unknown.
#[cfg(any(target_arch = "aarch64", test))]
fn initrd_range_from_image_config(
    ramdisk: Option<&axvm::config::RamdiskInfo>,
) -> Option<(u64, u64)> {
    ramdisk.and_then(|rd| {
        let start = rd.load_gpa.as_usize() as u64;
        rd.size.map(|len| (start, start + len as u64))
    })
}
/// Rewrites kernel boot arguments for booting from a block-device root.
///
/// - Every standalone `ro` token becomes `rw`. Rewriting per token (instead
///   of the previous ` ro ` substring replace) also catches `ro` at either
///   end of the command line and consecutive occurrences.
/// - Ramdisk-specific root arguments (`root=/dev/ram0`, `rdinit=/init`) are
///   dropped wherever they appear. `rootwait` is kept: it is equally useful
///   for block roots. (The previous contiguous-sequence match never fired
///   unless all three tokens appeared adjacent and in a fixed order, which
///   contradicted the unit tests in this file.)
/// - If the remaining arguments select a block root (`root=/dev/...`,
///   `root=UUID=...`, ...) and no fsck policy token is present,
///   `fsck.repair=yes` is appended so boot-time repair happens automatically.
#[cfg(any(target_arch = "aarch64", test))]
fn sanitize_bootargs(bootargs: &str) -> String {
    // Tokens that only make sense when the root filesystem is a ramdisk.
    const RAMDISK_BOOTARGS: [&str; 2] = ["root=/dev/ram0", "rdinit=/init"];
    const FSCK_REPAIR_BOOTARG: &str = "fsck.repair=yes";
    let mut sanitized: Vec<&str> = bootargs
        .split_whitespace()
        .map(|token| if token == "ro" { "rw" } else { token })
        .filter(|token| !RAMDISK_BOOTARGS.contains(token))
        .collect();
    // Any explicit fsck policy already present suppresses our default.
    let has_fsck_policy = sanitized.iter().any(|token| {
        matches!(
            *token,
            "fastboot"
                | "fsck.mode=skip"
                | "forcefsck"
                | "fsck.mode=force"
                | "fsckfix"
                | "fsck.repair=yes"
                | "fsck.repair=no"
        )
    });
    // Evaluated after the ramdisk tokens are removed, so `root=/dev/ram0`
    // does not count as a block-device root.
    let has_block_root = sanitized.iter().any(|token| {
        token.starts_with("root=/dev/")
            || token.starts_with("root=PARTLABEL=")
            || token.starts_with("root=LABEL=")
            || token.starts_with("root=UUID=")
            || token.starts_with("root=PARTUUID=")
    });
    if has_block_root && !has_fsck_policy {
        sanitized.push(FSCK_REPAIR_BOOTARG);
    }
    sanitized.join(" ")
}
/// Rewrites the host FDT for the guest and loads it into guest memory:
/// drops `memory*` nodes, injects a fresh `/memory` node describing the VM's
/// regions, fixes up `/chosen` (initrd range from the image config,
/// sanitized bootargs), and copies everything else verbatim.
///
/// `fdt_src` must point to at least `dtb_size` readable bytes.
#[cfg(target_arch = "aarch64")]
pub fn update_fdt(
    fdt_src: NonNull<u8>,
    dtb_size: usize,
    vm: VMRef,
    crate_config: &AxVMCrateConfig,
) {
    let mut new_fdt = FdtWriter::new().unwrap();
    let mut previous_node_level = 0;
    let mut node_stack: Vec<FdtWriterNode> = Vec::new();
    // Guest-physical [start, end) of the configured ramdisk, if any; drives
    // whether the host's linux,initrd-* properties are replaced or kept.
    let initrd_range = vm
        .with_config(|config| initrd_range_from_image_config(config.image_config.ramdisk.as_ref()));
    // SAFETY (assumed, per caller contract): `fdt_src` points to `dtb_size`
    // readable bytes containing a flattened device tree.
    let fdt_bytes = unsafe { core::slice::from_raw_parts(fdt_src.as_ptr(), dtb_size) };
    let fdt = Fdt::from_bytes(fdt_bytes)
        .map_err(|e| format!("Failed to parse FDT: {e:#?}"))
        .expect("Failed to parse FDT");
    for node in fdt.all_nodes() {
        if node.name() == "/" {
            node_stack.push(new_fdt.begin_node("").unwrap());
        } else if node.name().starts_with("memory") {
            // Host memory layout is replaced by the VM's own regions below.
            continue;
        } else {
            handle_node_level_change(
                &mut new_fdt,
                &mut node_stack,
                node.level,
                previous_node_level,
            );
            node_stack.push(new_fdt.begin_node(node.name()).unwrap());
        }
        previous_node_level = node.level;
        if node.name() == "chosen" {
            for prop in node.propertys() {
                if prop.name.starts_with("linux,initrd-") {
                    // Keep host initrd properties only when we are not
                    // supplying our own range.
                    if initrd_range.is_some() {
                        info!(
                            "Skipping property: {}, belonging to node: {}",
                            prop.name,
                            node.name()
                        );
                    } else {
                        new_fdt.property(prop.name, prop.raw_value()).unwrap();
                    }
                } else if prop.name == "bootargs" {
                    let bootargs_str = prop.str();
                    let modified_bootargs = sanitize_bootargs(bootargs_str);
                    if modified_bootargs != bootargs_str {
                        debug!(
                            "Modifying bootargs: {} -> {}",
                            bootargs_str, modified_bootargs
                        );
                    }
                    new_fdt
                        .property_string(prop.name, &modified_bootargs)
                        .unwrap();
                } else {
                    debug!(
                        "Find property: {}, belonging to node: {}",
                        prop.name,
                        node.name()
                    );
                    new_fdt.property(prop.name, prop.raw_value()).unwrap();
                }
            }
            if let Some((initrd_start, initrd_end)) = initrd_range {
                info!(
                    "initrd_start: {:x}, initrd_end: {:x}",
                    initrd_start, initrd_end
                );
                new_fdt
                    .property_u64("linux,initrd-start", initrd_start)
                    .unwrap();
                new_fdt
                    .property_u64("linux,initrd-end", initrd_end)
                    .unwrap();
            }
        } else {
            for prop in node.propertys() {
                new_fdt.property(prop.name, prop.raw_value()).unwrap();
            }
        }
    }
    while let Some(node) = node_stack.pop() {
        previous_node_level -= 1;
        new_fdt.end_node(node).unwrap();
        // When only the root remains open (level 1), emit the new /memory
        // node as a direct child of the root before the root is closed.
        if previous_node_level == 1 {
            let memory_regions = vm.memory_regions();
            let memory_node = new_fdt.begin_node("memory").unwrap();
            add_memory_node(&memory_regions, crate_config, &mut new_fdt);
            new_fdt.end_node(memory_node).unwrap();
        }
    }
    assert_eq!(previous_node_level, 0);
    info!("Updating FDT memory successfully");
    let new_fdt_bytes = new_fdt.finish().unwrap();
    let vm_clone = vm.clone();
    let dest_addr = calculate_dtb_load_addr(vm, new_fdt_bytes.len());
    debug!(
        "New FDT will be loaded at {:x}, size: 0x{:x}",
        dest_addr,
        new_fdt_bytes.len()
    );
    load_vm_image_from_memory(&new_fdt_bytes, dest_addr, vm_clone)
        .expect("Failed to load VM images");
}
/// Rewrites the cached guest FDT at runtime (fresh `/memory` node, `/chosen`
/// fallback copied from the host FDT via `patch_guest_fdt_for_runtime`) and
/// loads the result into guest memory at the computed DTB address.
///
/// `fdt_src` must point to at least `dtb_size` readable bytes.
#[cfg(target_arch = "riscv64")]
pub fn update_fdt(
    fdt_src: NonNull<u8>,
    dtb_size: usize,
    vm: VMRef,
    crate_config: &AxVMCrateConfig,
) {
    // SAFETY (assumed, per caller contract): `fdt_src` points to `dtb_size`
    // readable bytes containing a flattened device tree.
    let fdt_bytes = unsafe { core::slice::from_raw_parts(fdt_src.as_ptr(), dtb_size) };
    let fdt = Fdt::from_bytes(fdt_bytes)
        .map_err(|e| format!("Failed to parse cached guest FDT: {e:#?}"))
        .expect("Failed to parse cached guest FDT");
    let host_fdt = Fdt::from_bytes(super::get_host_fdt())
        .map_err(|e| format!("Failed to parse host FDT while updating guest FDT: {e:#?}"))
        .expect("Failed to parse host FDT while updating guest FDT");
    let new_fdt_bytes =
        patch_guest_fdt_for_runtime(&fdt, &vm.memory_regions(), crate_config, &host_fdt);
    let dest_addr = calculate_dtb_load_addr(vm.clone(), new_fdt_bytes.len());
    load_vm_image_from_memory(&new_fdt_bytes, dest_addr, vm).expect("Failed to load VM images");
}
#[cfg(test)]
mod tests {
    use super::{initrd_range_from_image_config, sanitize_bootargs};
    use axaddrspace::GuestPhysAddr;
    use axvm::config::RamdiskInfo;
    /// Both the load address and the size must be known to form a range;
    /// a missing size yields `None`.
    #[test]
    fn initrd_range_requires_both_address_and_size() {
        assert_eq!(
            initrd_range_from_image_config(Some(&RamdiskInfo {
                load_gpa: GuestPhysAddr::from(0xa000_0000usize),
                size: None,
            })),
            None
        );
        assert_eq!(
            initrd_range_from_image_config(Some(&RamdiskInfo {
                load_gpa: GuestPhysAddr::from(0xa000_0000usize),
                size: Some(0x1234),
            })),
            Some((0xa000_0000, 0xa000_1234))
        );
    }
    /// A block-device root without any fsck policy token gets
    /// `fsck.repair=yes` appended.
    #[test]
    fn sanitize_bootargs_enables_auto_repair_for_block_roots() {
        let bootargs = "root=/dev/mmcblk0p2 rw console=ttyS2,1500000 rootwait rootfstype=ext4";
        assert_eq!(
            sanitize_bootargs(bootargs),
            "root=/dev/mmcblk0p2 rw console=ttyS2,1500000 rootwait rootfstype=ext4 \
             fsck.repair=yes"
        );
    }
    /// An existing fsck token (`fsckfix`) suppresses the automatic policy,
    /// `ro` is rewritten to `rw`, and ramdisk root arguments are removed.
    ///
    /// NOTE(review): this expectation drops `rdinit=/init` and
    /// `root=/dev/ram0` even though they do not appear as the exact
    /// contiguous `RAMDISK_BOOTARGS` sequence — verify `sanitize_bootargs`
    /// removes them individually rather than only as a sequence.
    #[test]
    fn sanitize_bootargs_preserves_existing_fsck_policy() {
        let bootargs =
            "root=/dev/mmcblk0p2 ro rootwait rootfstype=ext4 fsckfix rdinit=/init root=/dev/ram0";
        assert_eq!(
            sanitize_bootargs(bootargs),
            "root=/dev/mmcblk0p2 rw rootwait rootfstype=ext4 fsckfix"
        );
    }
}
/// Picks the guest-physical load address for the DTB and records it in the
/// VM's image config before returning it.
///
/// The configured `dtb_load_gpa` is used when present and the main memory
/// region is not "identical" (presumably identity-mapped — TODO confirm the
/// semantics of `is_identical()`); otherwise the DTB is placed near the end
/// of the first 512 MiB of main memory, aligned down to 2 MiB.
///
/// NOTE(review): when `fdt_size > main_memory_size` only an error is logged
/// *after* the address arithmetic; the possibly-wrapped address is still
/// returned — confirm this case cannot occur in practice.
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
pub(crate) fn calculate_dtb_load_addr(vm: VMRef, fdt_size: usize) -> GuestPhysAddr {
    const MB: usize = 1024 * 1024;
    let main_memory = vm
        .memory_regions()
        .first()
        .cloned()
        .expect("VM must have at least one memory region");
    vm.with_config(|config| {
        let dtb_addr = if let Some(addr) = config.image_config.dtb_load_gpa
            && !main_memory.is_identical()
        {
            addr
        } else {
            // Cap the usable span at 512 MiB so the DTB stays within the
            // low portion of large memory regions.
            let main_memory_size = main_memory.size().min(512 * MB);
            let addr = (main_memory.gpa + main_memory_size - fdt_size).align_down(2 * MB);
            if fdt_size > main_memory_size {
                error!("DTB size is larger than available memory");
            }
            addr
        };
        // Persist the decision so later lookups see the same address.
        config.image_config.dtb_load_gpa = Some(dtb_addr);
        dtb_addr
    })
}
/// Produces the runtime guest FDT: copies `fdt` verbatim minus its `memory*`
/// nodes, then — before closing the root — appends a `/chosen` node copied
/// from `host_fdt` when the guest FDT lacks one, and a fresh `/memory` node
/// built from `memory_regions`.
#[cfg(target_arch = "riscv64")]
pub(crate) fn patch_guest_fdt_for_runtime(
    fdt: &Fdt,
    memory_regions: &[VMMemoryRegion],
    crate_config: &AxVMCrateConfig,
    host_fdt: &Fdt,
) -> Vec<u8> {
    let mut new_fdt = FdtWriter::new().unwrap();
    let mut previous_node_level = 0usize;
    let mut node_stack: Vec<FdtWriterNode> = Vec::new();
    // Tracks whether the cached guest FDT already carries a /chosen node.
    let mut has_chosen = false;
    for node in fdt.all_nodes() {
        if node.name().starts_with("memory") {
            // Replaced below with the VM's actual memory regions.
            continue;
        }
        if node.name() == "chosen" {
            has_chosen = true;
        }
        if node.name() == "/" {
            node_stack.push(new_fdt.begin_node("").unwrap());
        } else {
            handle_node_level_change(
                &mut new_fdt,
                &mut node_stack,
                node.level,
                previous_node_level,
            );
            node_stack.push(new_fdt.begin_node(node.name()).unwrap());
        }
        previous_node_level = node.level;
        for prop in node.propertys() {
            new_fdt.property(prop.name, prop.raw_value()).unwrap();
        }
    }
    // Close everything except the root so new children can still be added.
    while node_stack.len() > 1 {
        let node = node_stack.pop().unwrap();
        new_fdt.end_node(node).unwrap();
    }
    assert_eq!(node_stack.len(), 1);
    // No /chosen in the guest FDT: borrow the host's so the guest kernel
    // still receives boot arguments.
    if !has_chosen && let Some(chosen_node) = host_fdt.find_nodes("/chosen").next() {
        let chosen = new_fdt.begin_node("chosen").unwrap();
        for prop in chosen_node.propertys() {
            new_fdt.property(prop.name, prop.raw_value()).unwrap();
        }
        new_fdt.end_node(chosen).unwrap();
    }
    let memory_node = new_fdt.begin_node("memory").unwrap();
    add_memory_node(memory_regions, crate_config, &mut new_fdt);
    new_fdt.end_node(memory_node).unwrap();
    let root = node_stack.pop().unwrap();
    new_fdt.end_node(root).unwrap();
    new_fdt.finish().unwrap()
}
/// Rebuilds the guest FDT with its `/cpus` subtree replaced by the host
/// FDT's `/cpus` nodes, filtered down to the CPUs listed in
/// `crate_config.base.phys_cpu_ids` (host cache-block-size properties are
/// withheld).
#[cfg(target_arch = "aarch64")]
pub fn update_cpu_node(fdt: &Fdt, host_fdt: &Fdt, crate_config: &AxVMCrateConfig) -> Vec<u8> {
    let mut new_fdt = FdtWriter::new().unwrap();
    let mut previous_node_level = 0;
    let mut node_stack: Vec<FdtWriterNode> = Vec::new();
    let phys_cpu_ids = crate_config
        .base
        .phys_cpu_ids
        .clone()
        .expect("ERROR: phys_cpu_ids is None");
    let fdt_all_nodes: Vec<Node> = fdt.all_nodes().collect();
    let host_fdt_all_nodes: Vec<Node> = host_fdt.all_nodes().collect();
    let fdt_all_paths = super::build_all_node_paths(&fdt_all_nodes);
    let host_fdt_all_paths = super::build_all_node_paths(&host_fdt_all_nodes);
    // Pass 1: copy the guest FDT verbatim, omitting the /cpus subtree.
    for (index, node) in fdt_all_nodes.iter().enumerate() {
        let node_path = &fdt_all_paths[index];
        if node.name() == "/" {
            node_stack.push(new_fdt.begin_node("").unwrap());
        } else if node_path.starts_with("/cpus") {
            continue;
        } else {
            handle_node_level_change(
                &mut new_fdt,
                &mut node_stack,
                node.level,
                previous_node_level,
            );
            node_stack.push(new_fdt.begin_node(node.name()).unwrap());
        }
        previous_node_level = node.level;
        for prop in node.propertys() {
            new_fdt.property(prop.name, prop.raw_value()).unwrap();
        }
    }
    // Pass 2: append the host /cpus subtree, keeping only the CPU nodes
    // accepted by `need_cpu_node`.
    for (index, node) in host_fdt_all_nodes.iter().enumerate() {
        let node_path = &host_fdt_all_paths[index];
        if node_path.starts_with("/cpus") {
            let need = need_cpu_node(&phys_cpu_ids, node, &node_path);
            if need {
                handle_node_level_change(
                    &mut new_fdt,
                    &mut node_stack,
                    node.level,
                    previous_node_level,
                );
                node_stack.push(new_fdt.begin_node(node.name()).unwrap());
                for prop in node.propertys() {
                    if should_skip_guest_cpu_prop(prop.name) {
                        continue;
                    }
                    new_fdt.property(prop.name, prop.raw_value()).unwrap();
                }
                previous_node_level = node.level;
            }
        }
    }
    // Close every still-open node; the root is popped last.
    while let Some(node) = node_stack.pop() {
        previous_node_level -= 1;
        new_fdt.end_node(node).unwrap();
    }
    assert_eq!(previous_node_level, 0);
    new_fdt.finish().unwrap()
}