use alloc::{collections::BTreeMap, format, sync::Arc, vec::Vec};
use core::sync::atomic::{AtomicUsize, Ordering};
use ax_errno::{AxResult, ax_err_type};
use ax_kspin::SpinNoIrq as Mutex;
#[cfg(target_arch = "riscv64")]
use riscv_vcpu::GprIndex as RiscvGprIndex;
use crate::{
AsVCpuTask, AxVCpuExitReason, CpuMask, GuestPhysAddr, VCpuState, VCpuTask,
runtime::{VCpuRef, VMRef, sub_running_vm_count},
};
/// Size value handed to `TaskInner::new` for every vCPU task created in this file
/// (0x40000 = 262144, i.e. 256 KiB if the unit is bytes, as the name suggests).
const KERNEL_STACK_SIZE: usize = 0x40000;
/// Global registry mapping a VM id to that VM's shared vCPU bookkeeping ([`VMVCpus`]).
///
/// Entries are inserted by `setup_vm_primary_vcpu` and removed by `cleanup_vm_vcpus`.
static VM_VCPU_TASKS: Mutex<BTreeMap<usize, Arc<VMVCpus>>> = Mutex::new(BTreeMap::new());
/// Looks up the vCPU bookkeeping registered for `vm_id`.
///
/// Returns a cloned `Arc`, so the registry lock is released as soon as this
/// function returns; yields `None` when no entry exists for the VM.
fn get_vm_vcpus(vm_id: usize) -> Option<Arc<VMVCpus>> {
    let registry = VM_VCPU_TASKS.lock();
    registry.get(&vm_id).map(Arc::clone)
}
/// Per-VM bookkeeping shared by all vCPU tasks of one VM.
pub struct VMVCpus {
// Id of the owning VM, captured at construction; not read anywhere in this view.
_vm_id: usize,
// Queue that vCPU tasks block on (`wait` / `wait_until`) and that
// `notify_one` / `notify_all` wake.
wait_queue: crate::WaitQueue,
// Maps a vCPU id to the host task spawned for it.
vcpu_task_list: Mutex<BTreeMap<usize, crate::AxTaskRef>>,
// Maps a vCPU id to the interrupt vectors queued for it and not yet drained.
pending_interrupts: Mutex<BTreeMap<usize, Vec<usize>>>,
// Incremented by `mark_vcpu_running`, decremented by `mark_vcpu_exiting`.
running_halting_vcpu_count: AtomicUsize,
}
impl VMVCpus {
    /// Builds empty bookkeeping for `vm`: no tasks, no pending interrupts and a
    /// running-vCPU counter of zero.
    fn new(vm: VMRef) -> Self {
        let vm_id = vm.id();
        Self {
            _vm_id: vm_id,
            wait_queue: crate::WaitQueue::new(),
            vcpu_task_list: Mutex::new(BTreeMap::new()),
            pending_interrupts: Mutex::new(BTreeMap::new()),
            running_halting_vcpu_count: AtomicUsize::new(0),
        }
    }

    /// Records `vcpu_task` as the host task of `vcpu_id` and makes sure an
    /// interrupt queue exists for that vCPU.
    fn add_vcpu_task(&self, vcpu_id: usize, vcpu_task: crate::AxTaskRef) {
        {
            let mut tasks = self.vcpu_task_list.lock();
            tasks.insert(vcpu_id, vcpu_task);
        }
        let mut queues = self.pending_interrupts.lock();
        queues.entry(vcpu_id).or_default();
    }

    /// Appends `vector` to the interrupt queue of `vcpu_id`.
    ///
    /// Returns the value of `cpu_id()` of the vCPU's task on success, or a
    /// `NotFound` error when no task is registered for `vcpu_id`.
    fn queue_interrupt(&self, vcpu_id: usize, vector: usize) -> AxResult<usize> {
        let registered = self.vcpu_task_list.lock().get(&vcpu_id).cloned();
        let Some(task) = registered else {
            return Err(ax_err_type!(
                NotFound,
                format!("vCPU {vcpu_id} task not found")
            ));
        };
        let mut queues = self.pending_interrupts.lock();
        let queue = queues.entry(vcpu_id).or_default();
        queue.push(vector);
        Ok(task.cpu_id() as usize)
    }

    /// Removes and returns every vector queued for `vcpu_id`, leaving its queue
    /// empty; returns an empty vector when the vCPU has no queue.
    fn drain_pending_interrupts(&self, vcpu_id: usize) -> Vec<usize> {
        let mut queues = self.pending_interrupts.lock();
        match queues.get_mut(&vcpu_id) {
            Some(queue) => core::mem::take(queue),
            None => Vec::new(),
        }
    }

    /// Blocks the calling task on this VM's wait queue.
    fn wait(&self) {
        self.wait_queue.wait()
    }

    /// Blocks the calling task on this VM's wait queue, forwarding `condition`
    /// to the queue's `wait_until`.
    fn wait_until<F: Fn() -> bool>(&self, condition: F) {
        self.wait_queue.wait_until(condition)
    }

    /// Wakes one waiter of this VM's wait queue (passing `false` to the queue).
    #[allow(dead_code)]
    fn notify_one(&self) {
        self.wait_queue.notify_one(false);
    }

    /// Wakes all waiters of this VM's wait queue (passing `false` to the queue).
    fn notify_all(&self) {
        self.wait_queue.notify_all(false);
    }

    /// Bumps the running-vCPU counter by one.
    fn mark_vcpu_running(&self) {
        let _previous = self
            .running_halting_vcpu_count
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Decrements the running-vCPU counter, refusing to go below zero.
    ///
    /// Returns `true` only when the counter was exactly 1 before the decrement,
    /// i.e. the caller was the last counted vCPU.
    fn mark_vcpu_exiting(&self) -> bool {
        let previous = self.running_halting_vcpu_count.fetch_update(
            Ordering::Relaxed,
            Ordering::Relaxed,
            |count| count.checked_sub(1),
        );
        matches!(previous, Ok(1))
    }
}
fn wait(vm_vcpus: &VMVCpus) {
vm_vcpus.wait();
}
fn wait_for<F>(vm_vcpus: &VMVCpus, condition: F)
where
F: Fn() -> bool,
{
vm_vcpus.wait_until(condition);
}
/// Wakes one task waiting on the wait queue of VM `vm_id`.
///
/// Logs a warning when the VM has no registered vCPU bookkeeping.
// NOTE(review): this wakes a single waiter of the queue; that it is the primary
// vCPU is implied by the name only — confirm against the wait queue's semantics.
pub(crate) fn notify_primary_vcpu(vm_id: usize) {
    match get_vm_vcpus(vm_id) {
        Some(vm_vcpus) => vm_vcpus.notify_one(),
        None => {
            warn!("VM[{vm_id}] vCPU resources not found");
        }
    }
}
/// Wakes every task waiting on the wait queue of VM `vm_id`.
///
/// Silently does nothing when the VM has no registered vCPU bookkeeping.
pub(crate) fn notify_all_vcpus(vm_id: usize) {
    let Some(vm_vcpus) = get_vm_vcpus(vm_id) else {
        return;
    };
    vm_vcpus.notify_all();
}
/// Queues interrupt `vector` for `vcpu_id` of VM `vm_id` and prods the vCPU.
///
/// After queuing, all waiters of the VM's wait queue are woken and an IPI is
/// sent to the CPU id reported by the vCPU's task.
///
/// # Errors
///
/// Returns `NotFound` when the VM has no registered vCPU bookkeeping or the
/// vCPU has no registered task.
pub(crate) fn queue_interrupt(vm_id: usize, vcpu_id: usize, vector: usize) -> AxResult {
    let Some(vm_vcpus) = get_vm_vcpus(vm_id) else {
        return Err(ax_err_type!(
            NotFound,
            format!("VM[{vm_id}] vCPU resources not found")
        ));
    };
    let target_cpu = vm_vcpus.queue_interrupt(vcpu_id, vector)?;
    vm_vcpus.notify_all();
    crate::host::task::send_ipi(target_cpu);
    Ok(())
}
/// Drains every interrupt queued for `vcpu_id` of VM `vm_id` and injects each
/// one into `vcpu`.
///
/// An injection failure is logged and the remaining vectors are still
/// processed. When the VM has no registered bookkeeping, a warning is logged
/// and nothing is injected.
pub(crate) fn inject_pending_interrupts(vm_id: usize, vcpu_id: usize, vcpu: &VCpuRef) {
    let vm_vcpus = match get_vm_vcpus(vm_id) {
        Some(found) => found,
        None => {
            warn!("VM[{vm_id}] vCPU resources not found, cannot drain VCpu[{vcpu_id}] interrupts");
            return;
        }
    };
    let queued = vm_vcpus.drain_pending_interrupts(vcpu_id);
    for vector in queued {
        trace!("Injecting queued interrupt {vector:#x} into VM[{vm_id}] VCpu[{vcpu_id}]");
        let Err(err) = vcpu.inject_interrupt(vector) else {
            continue;
        };
        warn!(
            "Failed to inject queued interrupt {vector:#x} into VM[{vm_id}] VCpu[{vcpu_id}]: \
             {err:?}"
        );
    }
}
/// Computes the set of vCPU ids that an IPI raised by `current_vcpu_id` targets.
///
/// * `send_to_all` — every vCPU of `vm` except the sender.
/// * `send_to_self` (and not `send_to_all`) — only the sender.
/// * otherwise, on aarch64 — each vCPU whose physical id has its low 8 bits
///   (`aff0`) below 16 with the matching bit set in `target_cpu_aux`, and whose
///   remaining bits equal `target_cpu`.
/// * otherwise, on other architectures — `target_cpu` is used directly as the
///   vCPU id and `target_cpu_aux` is ignored.
///
/// The returned mask may be empty; in particular a `target_cpu` that does not
/// fit in the 64-bit mask yields no target instead of indexing out of range.
fn ipi_targets(
    vm: &VMRef,
    current_vcpu_id: usize,
    target_cpu: u64,
    target_cpu_aux: u64,
    send_to_all: bool,
    send_to_self: bool,
) -> CpuMask<64> {
    let mut targets = CpuMask::new();
    if send_to_all {
        for vcpu in vm.vcpu_list() {
            if vcpu.id() != current_vcpu_id {
                targets.set(vcpu.id(), true);
            }
        }
    } else if send_to_self {
        targets.set(current_vcpu_id, true);
    } else {
        #[cfg(target_arch = "aarch64")]
        {
            for (vcpu_id, _, phys_id) in vm.get_vcpu_affinities_pcpu_ids() {
                let affinity = phys_id as u64;
                let aff0 = affinity & 0xff;
                let aff123 = affinity & !0xff;
                // `aff0 < 16` also keeps the shift below in range.
                if aff123 == target_cpu && aff0 < 16 && (target_cpu_aux & (1u64 << aff0)) != 0 {
                    targets.set(vcpu_id, true);
                }
            }
        }
        #[cfg(not(target_arch = "aarch64"))]
        {
            // Number of bits in the returned `CpuMask<64>`.
            const MASK_BITS: u64 = 64;
            let _ = target_cpu_aux;
            // `target_cpu` originates from the guest's IPI request, so it must be
            // range-checked before being used as a bit index. An out-of-range
            // value leaves the mask empty and the caller reports "no target".
            if target_cpu < MASK_BITS {
                targets.set(target_cpu as usize, true);
            }
        }
    }
    targets
}
/// Unregisters the vCPU bookkeeping of VM `vm_id` and joins all of its vCPU tasks.
///
/// Logs a warning and returns when the VM has no registered bookkeeping.
pub(crate) fn cleanup_vm_vcpus(vm_id: usize) {
    // Remove the entry in its own statement so the registry guard is dropped at
    // the `;`. Using `VM_VCPU_TASKS.lock()` as an `if let` scrutinee would keep
    // the spin lock held for the whole body, i.e. across the blocking `join()`
    // calls below, and any vCPU task still calling `get_vm_vcpus` would spin on
    // it forever.
    let removed = VM_VCPU_TASKS.lock().remove(&vm_id);
    let Some(vm_vcpus) = removed else {
        warn!("VM[{}] VCpu resources not found in queue", vm_id);
        return;
    };

    // Snapshot the task handles so the per-VM task-list lock is not held while
    // joining either.
    let tasks: Vec<_> = vm_vcpus.vcpu_task_list.lock().values().cloned().collect();
    let task_count = tasks.len();
    info!("VM[{}] Joining {} VCpu tasks...", vm_id, task_count);
    for (idx, task) in tasks.iter().enumerate() {
        debug!(
            "VM[{}] Joining VCpu task[{}]: {}",
            vm_id,
            idx,
            task.id_name()
        );
        let exit_code = task.join();
        debug!(
            "VM[{}] VCpu task[{}] exited with code: {}",
            vm_id, idx, exit_code
        );
    }
    info!(
        "VM[{}] VCpu resources cleaned up, {} VCpu tasks joined successfully",
        vm_id, task_count
    );
}
/// Increments the running-vCPU counter of VM `vm_id`; does nothing when the VM
/// has no registered vCPU bookkeeping.
fn mark_vcpu_running(vm_id: usize) {
    let Some(vm_vcpus) = get_vm_vcpus(vm_id) else {
        return;
    };
    vm_vcpus.mark_vcpu_running();
}
/// Prepares vCPU `vcpu_id` of `vm` to start at `entry_point` and spawns its task.
///
/// The vCPU must be in `VCpuState::Free`. Its entry is set, then `arg` is placed
/// in GPR 0 (non-riscv64) or the vCPU id and `arg` in A0/A1 (riscv64). Finally a
/// task is spawned for it and registered in the VM's bookkeeping.
///
/// # Errors
///
/// * `NotFound` — the vCPU id is not in the VM's vCPU list, or the VM has no
///   registered vCPU bookkeeping.
/// * `BadState` — the vCPU is not in the `Free` state.
/// * Any error returned by `set_entry`.
fn vcpu_on(vm: VMRef, vcpu_id: usize, entry_point: GuestPhysAddr, arg: usize) -> AxResult {
    let Some(vcpu) = vm.vcpu_list().get(vcpu_id).cloned() else {
        return Err(ax_err_type!(
            NotFound,
            format!("vCPU {vcpu_id} not found")
        ));
    };

    let is_free = vcpu.state() == VCpuState::Free;
    if !is_free {
        return Err(ax_err_type!(
            BadState,
            format!("vCPU {} invalid state {:?}", vcpu.id(), vcpu.state())
        ));
    }

    vcpu.set_entry(entry_point)?;

    #[cfg(not(target_arch = "riscv64"))]
    vcpu.set_gpr(0, arg);
    #[cfg(target_arch = "riscv64")]
    {
        info!(
            "vcpu_on: vcpu[{}] entry={:x} opaque={:x}",
            vcpu_id, entry_point, arg
        );
        vcpu.set_gpr(RiscvGprIndex::A0 as usize, vcpu_id);
        vcpu.set_gpr(RiscvGprIndex::A1 as usize, arg);
    }

    let Some(vm_vcpus) = get_vm_vcpus(vm.id()) else {
        return Err(ax_err_type!(
            NotFound,
            format!("VM[{}] vCPU resources not found", vm.id())
        ));
    };

    let spawned = alloc_vcpu_task(&vm, vcpu);
    vm_vcpus.add_vcpu_task(vcpu_id, spawned);
    Ok(())
}
/// Registers vCPU bookkeeping for `vm` and spawns the task of its primary vCPU
/// (vCPU id 0).
///
/// Returns early, with a log message, when the VM has no vCPU 0 or when
/// bookkeeping for this VM id is already registered.
pub fn setup_vm_primary_vcpu(vm: VMRef) {
    info!("Initializing VM[{}]'s {} vcpus", vm.id(), vm.vcpu_num());

    let vm_id = vm.id();
    let primary_vcpu_id = 0;
    let primary_vcpu = match vm.vcpu_list().get(primary_vcpu_id).cloned() {
        Some(vcpu) => vcpu,
        None => {
            warn!("VM[{vm_id}] has no primary vCPU");
            return;
        }
    };

    let vm_vcpus = Arc::new(VMVCpus::new(vm.clone()));
    {
        // Keep the registry lock scoped to the check-and-insert only.
        let mut registry = VM_VCPU_TASKS.lock();
        if registry.contains_key(&vm_id) {
            debug!("VM[{vm_id}] vCPU resources already exist");
            return;
        }
        registry.insert(vm_id, Arc::clone(&vm_vcpus));
    }

    let task = alloc_vcpu_task(&vm, primary_vcpu);
    vm_vcpus.add_vcpu_task(primary_vcpu_id, task);
}
/// Creates and spawns the host task that will run `vcpu` of `vm`.
///
/// The task runs `vcpu_run`, is named `VM[<vm id>]-VCpu[<vcpu id>]`, uses
/// `KERNEL_STACK_SIZE`, and carries a `VCpuTask` as its task extension. When the
/// vCPU reports a physical CPU set, the task's CPU mask is set from it.
///
/// Returns the handle produced by `spawn_task`.
fn alloc_vcpu_task(vm: &VMRef, vcpu: VCpuRef) -> crate::AxTaskRef {
    info!("Spawning task for VM[{}] VCpu[{}]", vm.id(), vcpu.id());

    let task_name = format!("VM[{}]-VCpu[{}]", vm.id(), vcpu.id());
    let mut vcpu_task = crate::TaskInner::new(vcpu_run, task_name, KERNEL_STACK_SIZE);

    if let Some(phys_cpu_set) = vcpu.phys_cpu_set() {
        let mask = crate::host::task::cpu_mask_from_raw_bits(phys_cpu_set);
        vcpu_task.set_cpumask(mask);
    }

    let task_ext = crate::AxTaskExt::from_impl(VCpuTask::new(vm, vcpu));
    *vcpu_task.task_ext_mut() = Some(task_ext);

    info!(
        "VCpu task {} created {:?}",
        vcpu_task.id_name(),
        vcpu_task.cpumask()
    );
    crate::host::task::spawn_task(vcpu_task)
}
/// Entry function of every vCPU task (it is the function handed to
/// `TaskInner::new` by `alloc_vcpu_task`).
///
/// Resolves the VM and vCPU from the current task, blocks until the VM reports
/// `running()`, then repeatedly runs the vCPU and handles each exit reason.
/// After each handled exit the loop parks while the VM is suspending and
/// leaves once the VM is stopping. The vCPU for which `mark_vcpu_exiting`
/// returns `true` sets the VM status to `Stopped`, decrements the running-VM
/// count and wakes one waiter on `super::VMM`.
fn vcpu_run() {
let curr = crate::host::task::current_task();
let vm = curr.as_vcpu_task().vm();
let vcpu = curr.as_vcpu_task().vcpu.clone();
let vm_id = vm.id();
let vcpu_id = vcpu.id();
// Without the VM's bookkeeping there is no wait queue to park on, so bail out.
let Some(vm_vcpus) = get_vm_vcpus(vm_id) else {
warn!("VM[{vm_id}] vCPU resources not found, VCpu[{vcpu_id}] exiting");
return;
};
info!("VM[{}] VCpu[{}] waiting for running", vm.id(), vcpu.id());
// Park on the VM's wait queue until the VM reports that it is running.
wait_for(&vm_vcpus, || vm.running());
info!("VM[{}] VCpu[{}] running...", vm.id(), vcpu.id());
#[cfg(target_arch = "x86_64")]
super::x86_irq::enable_ioapic_irq_forwarding(&vm, &vcpu);
// Count this vCPU so that `mark_vcpu_exiting` can later detect the last one.
mark_vcpu_running(vm_id);
loop {
// Deliver interrupts queued through `queue_interrupt` before entering the guest.
inject_pending_interrupts(vm_id, vcpu_id, &vcpu);
#[cfg(target_arch = "x86_64")]
super::x86_irq::drain_pending_ioapic_irqs(&vm, &vcpu);
match vm.run_vcpu(vcpu_id) {
Ok(exit_reason) => match exit_reason {
AxVCpuExitReason::Hypercall { nr, args } => {
debug!("Hypercall [{nr}] args {args:x?}");
use crate::runtime::hvc::HyperCall;
// A hypercall that fails to execute returns -1 to the guest. One that cannot
// even be constructed is only logged and the return value is left untouched.
match HyperCall::new(vm.clone(), nr, args) {
Ok(hypercall) => {
let ret_val = match hypercall.execute() {
Ok(ret_val) => ret_val as isize,
Err(err) => {
warn!("Hypercall [{nr:#x}] failed: {err:?}");
-1
}
};
vcpu.set_return_value(ret_val as usize);
}
Err(err) => {
warn!("Hypercall [{nr:#x}] failed: {err:?}");
}
}
}
AxVCpuExitReason::FailEntry {
hardware_entry_failure_reason,
} => {
// Only logged; the loop carries on to the suspend/stop checks below.
warn!(
"VM[{vm_id}] VCpu[{vcpu_id}] run failed with exit code \
{hardware_entry_failure_reason}"
);
}
AxVCpuExitReason::ExternalInterrupt { vector } => {
debug!("VM[{vm_id}] run VCpu[{vcpu_id}] get irq {vector}");
// The host IRQ is dispatched here on every architecture except aarch64;
// riscv64 does it inside `with_current_cpu_set` and then latches hvip.
#[cfg(not(any(target_arch = "aarch64", target_arch = "riscv64")))]
crate::host::arceos::dispatch_host_irq(vector as usize);
#[cfg(target_arch = "riscv64")]
vcpu.with_current_cpu_set(|| {
crate::host::arceos::dispatch_host_irq(vector as usize);
vcpu.get_arch_vcpu().latch_hvip_from_hw();
});
crate::check_timer_events();
#[cfg(target_arch = "x86_64")]
super::x86_irq::forward_passthrough_irq_from_vmexit(
&vm,
&vcpu,
vector as usize,
);
#[cfg(target_arch = "x86_64")]
super::x86_irq::inject_pending_serial_irq(&vm, &vcpu);
}
AxVCpuExitReason::PreemptionTimer => {
crate::timer::check_events();
#[cfg(target_arch = "x86_64")]
super::x86_irq::inject_due_pit_irq0(&vm, &vcpu);
#[cfg(target_arch = "x86_64")]
super::x86_irq::inject_pending_serial_irq(&vm, &vcpu);
}
AxVCpuExitReason::InterruptEnd { vector: _vector } => {
// Only x86_64 acts on this exit; elsewhere `_vector` is unused.
#[cfg(target_arch = "x86_64")]
if let Some(vector) = _vector {
super::x86_irq::inject_pending_ioapic_irq_after_eoi(&vm, &vcpu, vector);
}
}
AxVCpuExitReason::Halt => {
debug!("VM[{vm_id}] run VCpu[{vcpu_id}] Halt");
#[cfg(target_arch = "x86_64")]
super::x86_irq::inject_pending_serial_irq(&vm, &vcpu);
// On x86_64 the loop restarts at once; other architectures park on the wait queue.
// NOTE(review): the x86_64 `continue` skips the suspend/stop checks at the
// bottom of the loop — confirm that is intended.
#[cfg(target_arch = "x86_64")]
continue;
#[cfg(not(target_arch = "x86_64"))]
wait(&vm_vcpus)
}
AxVCpuExitReason::Nothing => {}
AxVCpuExitReason::CpuDown { _state } => {
warn!("VM[{vm_id}] run VCpu[{vcpu_id}] CpuDown state {_state:#x}");
// Park this vCPU on the wait queue until it is notified.
wait(&vm_vcpus)
}
AxVCpuExitReason::CpuUp {
target_cpu,
entry_point,
arg,
} => {
info!(
"VM[{vm_id}]'s VCpu[{vcpu_id}] try to boot target_cpu [{target_cpu}] \
entry_point={entry_point:x} arg={arg:#x}"
);
// Translate the requested physical CPU id into a vCPU id through the VM's
// affinity table; an unknown id returns usize::MAX to the guest.
let vcpu_mappings = vm.get_vcpu_affinities_pcpu_ids();
let Some(target_vcpu_id) =
vcpu_mappings.iter().find_map(|(vcpu_id, _, phys_id)| {
(*phys_id == target_cpu as usize).then_some(*vcpu_id)
})
else {
warn!("Physical CPU ID {target_cpu} not found in VM configuration");
vcpu.set_return_value(usize::MAX);
// NOTE(review): this `continue` also skips the suspend/stop checks below.
continue;
};
// Success is reported as 0 in GPR 0 (A0 on riscv64), failure as usize::MAX.
match vcpu_on(vm.clone(), target_vcpu_id, entry_point, arg as _) {
Ok(()) => {
#[cfg(not(target_arch = "riscv64"))]
vcpu.set_gpr(0, 0);
#[cfg(target_arch = "riscv64")]
vcpu.set_gpr(RiscvGprIndex::A0 as usize, 0);
}
Err(err) => {
warn!("Failed to boot VM[{vm_id}] VCpu[{target_vcpu_id}]: {err:?}");
vcpu.set_return_value(usize::MAX);
}
}
}
AxVCpuExitReason::SystemDown => {
warn!("VM[{vm_id}] run VCpu[{vcpu_id}] SystemDown");
if let Err(err) = vm.shutdown() {
warn!("VM[{vm_id}] shutdown failed: {err:?}");
}
// Wake every waiter on the VM's wait queue after the shutdown request.
notify_all_vcpus(vm_id);
}
AxVCpuExitReason::SendIPI {
target_cpu,
target_cpu_aux,
send_to_all,
send_to_self,
vector,
} => {
debug!(
"VM[{vm_id}] run VCpu[{vcpu_id}] SendIPI, target_cpu={target_cpu:#x}, \
target_cpu_aux={target_cpu_aux:#x}, vector={vector}",
);
let targets = ipi_targets(
&vm,
vcpu_id,
target_cpu,
target_cpu_aux,
send_to_all,
send_to_self,
);
if targets.is_empty() {
warn!(
"VM[{vm_id}] SendIPI has no target: target_cpu={target_cpu:#x}, \
target_cpu_aux={target_cpu_aux:#x}"
);
// NOTE(review): this `continue` also skips the suspend/stop checks below.
continue;
}
// An IPI aimed at the sender is injected directly into the current vCPU.
// NOTE(review): `expect` panics if that injection fails — confirm a failure
// here is truly an invariant violation and not guest-triggerable.
if targets.get(vcpu_id) {
crate::inject_current_vcpu_interrupt(vector as _)
.expect("failed to inject self IPI into current vCPU");
}
// All remaining targets are delivered through the VM.
let mut remote_targets = targets;
remote_targets.set(vcpu_id, false);
if !remote_targets.is_empty()
&& let Err(err) = vm.inject_interrupt_to_vcpu(remote_targets, vector as _)
{
warn!(
"Failed to inject interrupt {vector} to VM[{vm_id}] targets \
{remote_targets:?}: {err:?}"
);
}
}
e => {
warn!("VM[{vm_id}] run VCpu[{vcpu_id}] unhandled vmexit: {e:?}");
}
},
Err(err) => {
// A failed vCPU run requests a VM shutdown and wakes all waiters.
error!("VM[{vm_id}] run VCpu[{vcpu_id}] get error {err:?}");
if let Err(err) = vm.shutdown() {
warn!("VM[{vm_id}] shutdown failed after vCPU error: {err:?}");
}
notify_all_vcpus(vm_id);
}
}
// While the VM is suspending, park until it no longer is, then re-enter the loop.
if vm.suspending() {
debug!(
"VM[{}] VCpu[{}] is suspended, waiting for resume...",
vm_id, vcpu_id
);
wait_for(&vm_vcpus, || !vm.suspending());
info!("VM[{}] VCpu[{}] resumed from suspend", vm_id, vcpu_id);
continue;
}
// Once the VM is stopping, leave the loop; the last counted vCPU finalizes the VM state.
if vm.stopping() {
warn!(
"VM[{}] VCpu[{}] stopping because of VM stopping",
vm_id, vcpu_id
);
if vm_vcpus.mark_vcpu_exiting() {
info!("VM[{vm_id}] VCpu[{vcpu_id}] last VCpu exiting, decreasing running VM count");
vm.set_vm_status(crate::VMStatus::Stopped);
info!("VM[{}] state changed to Stopped", vm_id);
#[cfg(target_arch = "x86_64")]
super::x86_irq::disable_ioapic_irq_forwarding_for_vm(vm_id);
sub_running_vm_count(1);
crate::host::task::wait_queue_wake(&super::VMM, 1);
}
break;
}
}
info!("VM[{}] VCpu[{}] exiting...", vm_id, vcpu_id);
}