use crate::cloudhv::api::{APIClient, RestoreConfig};
use crate::cloudhv::config::{CloudHypervisorConfig, DeviceConfig, VMConfig, VMSpec};
use crate::cloudhv::errors::{CloudHypervisorError, Result};
use crate::cloudhv::process::CloudHypervisorProcess;
use crate::cloudhv::snapshot::{SnapshotInfo, SnapshotManager};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
/// Opaque identifier for the single VM managed by a runtime instance.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct VMHandle {
    pub id: String,
}
impl VMHandle {
    /// Builds a handle from anything convertible into a `String`.
    pub fn new(id: impl Into<String>) -> Self {
        let id = id.into();
        VMHandle { id }
    }
}
/// Coarse lifecycle state tracked for the runtime's single VM.
///
/// This mirrors the runtime's own bookkeeping (updated by
/// `update_vm_status`), not a live query of the hypervisor.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum VMStatus {
// Created via the API but not yet booted.
Created,
// Booted and, as far as the runtime knows, executing.
Running,
// Paused via the API.
Paused,
// Shut down, deleted, or killed.
Shutdown,
// State could not be determined.
Unknown,
}
/// Internal record for the runtime's single VM: the spec it was created
/// with plus the last status the runtime recorded for it.
#[derive(Clone)]
struct VMState {
// NOTE(review): `spec` is read by `resize_memory` (memory size check),
// so this allow may be stale — verify before removing it.
#[allow(dead_code)]
spec: VMSpec,
// Last status recorded by `update_vm_status`.
status: VMStatus,
}
/// Manages one `cloud-hypervisor` process and the single VM it hosts.
///
/// Mutable pieces sit behind `Arc<RwLock<Option<...>>>` so the runtime
/// can be shared across threads; `None` means "not started" / "no VM".
pub struct CloudHypervisorRuntime {
// Validated launch configuration (binary path, API socket, etc.).
config: CloudHypervisorConfig,
// Running hypervisor process; `None` until `start` succeeds.
process: Arc<RwLock<Option<CloudHypervisorProcess>>>,
// Client for the hypervisor's API socket; set alongside `process`.
api_client: Arc<RwLock<Option<APIClient>>>,
// Bookkeeping for the single VM; set by `create_vm`/`restore_snapshot`.
vm_state: Arc<RwLock<Option<VMState>>>,
// Records snapshot metadata under a base directory.
snapshot_manager: Arc<SnapshotManager>,
}
impl CloudHypervisorRuntime {
pub fn new(config: CloudHypervisorConfig) -> Result<Self> {
config.validate()?;
let snapshot_base = PathBuf::from("/tmp/ch-snapshots");
let snapshot_manager = SnapshotManager::new(snapshot_base)?;
Ok(Self {
config,
process: Arc::new(RwLock::new(None)),
api_client: Arc::new(RwLock::new(None)),
vm_state: Arc::new(RwLock::new(None)),
snapshot_manager: Arc::new(snapshot_manager),
})
}
pub fn start(&self) -> Result<()> {
let mut process_guard = self
.process
.write()
.map_err(|e| CloudHypervisorError::Process(format!("Failed to lock process: {}", e)))?;
if process_guard.is_some() {
return Err(CloudHypervisorError::Process(
"Runtime already started".to_string(),
));
}
let mut process = CloudHypervisorProcess::new(&self.config.binary_path)?
.with_api_socket(&self.config.api_socket_path);
if let Some(log_file) = &self.config.log_file {
process = process.with_log_file(log_file);
}
process = process
.with_seccomp(self.config.seccomp)
.with_landlock(self.config.landlock);
process.start()?;
let api_client = APIClient::new(&self.config.api_socket_path)?;
let mut api_guard = self
.api_client
.write()
.map_err(|e| CloudHypervisorError::Api(format!("Failed to lock API client: {}", e)))?;
*api_guard = Some(api_client);
*process_guard = Some(process);
Ok(())
}
/// Stops the hypervisor process, if one is running, then clears the
/// stored API client. Safe to call when already stopped.
pub fn stop(&self) -> Result<()> {
    {
        let mut process_guard = self
            .process
            .write()
            .map_err(|e| {
                CloudHypervisorError::Process(format!("Failed to lock process: {}", e))
            })?;
        if let Some(mut running) = process_guard.take() {
            running.stop()?;
        }
    }
    let mut api_guard = self
        .api_client
        .write()
        .map_err(|e| CloudHypervisorError::Api(format!("Failed to lock API client: {}", e)))?;
    api_guard.take();
    Ok(())
}
/// Creates the runtime's single VM from `spec`.
///
/// # Errors
/// Fails if a VM already exists, the runtime is not started, or the
/// hypervisor rejects the create request.
pub async fn create_vm(&self, spec: VMSpec) -> Result<VMHandle> {
    // Pre-check in its own scope: the original held a `std::sync`
    // write guard across the `.await` below, which can deadlock the
    // executor and makes this future non-Send.
    {
        let vm_state_guard = self.vm_state.read().map_err(|e| {
            CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
        })?;
        if vm_state_guard.is_some() {
            return Err(CloudHypervisorError::VmAlreadyExists(
                "Cloud Hypervisor runtime already has a VM. Each runtime instance manages exactly one VM.".to_string(),
            ));
        }
    }
    let api_client = self.get_api_client()?;
    let vm_id = api_client.vm_create(spec.config.clone()).await?;
    let handle = VMHandle::new(vm_id);
    let mut vm_state_guard = self.vm_state.write().map_err(|e| {
        CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
    })?;
    // Re-check: another task may have created a VM while we awaited.
    if vm_state_guard.is_some() {
        return Err(CloudHypervisorError::VmAlreadyExists(
            "Cloud Hypervisor runtime already has a VM. Each runtime instance manages exactly one VM.".to_string(),
        ));
    }
    *vm_state_guard = Some(VMState {
        spec,
        status: VMStatus::Created,
    });
    Ok(handle)
}
/// Boots the previously created VM and records the `Running` status.
pub async fn start_vm(&self, handle: &VMHandle) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    client.vm_boot().await?;
    self.update_vm_status(VMStatus::Running)
}
/// Stops the VM. With `force` the VM is deleted outright; otherwise the
/// guest is asked to shut down cooperatively. The recorded status
/// becomes `Shutdown` in both cases.
pub async fn stop_vm(&self, handle: &VMHandle, force: bool) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    if force {
        // Hard stop: tear the VM down immediately.
        client.vm_delete().await?;
    } else {
        // Soft stop: request a guest-cooperative shutdown.
        client.vm_shutdown().await?;
    }
    self.update_vm_status(VMStatus::Shutdown)
}
/// Escalating shutdown: API `vm.shutdown` → SIGTERM → SIGKILL.
///
/// Waits `api_timeout` after the API request, then (if the process is
/// still alive) sends SIGTERM and polls for up to `sigterm_timeout`,
/// then sends SIGKILL and polls for up to `sigkill_timeout`. Returns as
/// soon as the process is observed to have exited. If the VM process
/// cannot be discovered at all, only the API shutdown is attempted and
/// the status is set optimistically.
///
/// # Errors
/// Returns a `Process` error if the process is still alive after the
/// SIGKILL wait elapses.
pub async fn stop_vm_graceful(
&self,
handle: &VMHandle,
api_timeout: std::time::Duration,
sigterm_timeout: std::time::Duration,
sigkill_timeout: std::time::Duration,
) -> Result<()> {
use crate::cloudhv::discovery;
use std::time::Instant;
self.validate_vm_handle(handle)?;
// Best-effort: without a PID the signal escalation below is skipped.
let process_info = discovery::find_vm_process(&handle.id).ok();
let api_client = self.get_api_client()?;
// API failure is non-fatal here — the signals are the fallback.
if let Err(e) = api_client.vm_shutdown().await {
log::warn!("API shutdown failed: {}, will try signals", e);
}
// Give the guest time to act on the shutdown request.
tokio::time::sleep(api_timeout).await;
if let Some(ref info) = process_info {
if !discovery::is_process_running(info.pid) {
self.update_vm_status(VMStatus::Shutdown)?;
return Ok(());
}
log::debug!(
"VM still running after API shutdown, sending SIGTERM to PID {}",
info.pid
);
// SAFETY: kill(2) on a possibly-stale PID is harmless; the worst
// case is ESRCH. The return value is intentionally ignored because
// the polling loop below observes the actual outcome.
unsafe {
libc::kill(info.pid as i32, libc::SIGTERM);
}
// Poll every 500 ms until the process exits or the timeout lapses.
let sigterm_start = Instant::now();
while sigterm_start.elapsed() < sigterm_timeout {
if !discovery::is_process_running(info.pid) {
self.update_vm_status(VMStatus::Shutdown)?;
return Ok(());
}
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
log::debug!(
"VM still running after SIGTERM, sending SIGKILL to PID {}",
info.pid
);
// SAFETY: same reasoning as the SIGTERM call above.
unsafe {
libc::kill(info.pid as i32, libc::SIGKILL);
}
let sigkill_start = Instant::now();
while sigkill_start.elapsed() < sigkill_timeout {
if !discovery::is_process_running(info.pid) {
self.update_vm_status(VMStatus::Shutdown)?;
return Ok(());
}
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
// Surviving SIGKILL usually means the process is stuck in
// uninterruptible sleep — report instead of pretending success.
if discovery::is_process_running(info.pid) {
return Err(CloudHypervisorError::Process(format!(
"Failed to kill VM process {} after SIGKILL",
info.pid
)));
}
}
self.update_vm_status(VMStatus::Shutdown)?;
Ok(())
}
/// Pauses the running VM and records the `Paused` status.
pub async fn pause_vm(&self, handle: &VMHandle) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    client.vm_pause().await?;
    self.update_vm_status(VMStatus::Paused)
}
/// Resumes a paused VM and records the `Running` status.
pub async fn resume_vm(&self, handle: &VMHandle) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    client.vm_resume().await?;
    self.update_vm_status(VMStatus::Running)
}
/// "Resets" the VM by pressing the virtual ACPI power button.
///
/// NOTE(review): this sends a power-button event rather than a hard
/// reboot, so the guest decides how to react — confirm this matches the
/// intended reset semantics.
pub async fn reset_vm(&self, handle: &VMHandle) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    client.vm_power_button().await?;
    Ok(())
}
/// Grows the VM's memory to `size` bytes.
///
/// Cloud Hypervisor only supports hot-growing memory, so a request
/// below the spec's configured size is rejected up front.
///
/// # Errors
/// Fails on shrink attempts (`NotSupported`), if the runtime is not
/// started, or if the hypervisor rejects the resize.
pub async fn resize_memory(&self, handle: &VMHandle, size: u64) -> Result<()> {
    self.validate_vm_handle(handle)?;
    // Scope the read guard so it is released before the `.await` below:
    // the original held this `std::sync` guard across the await, which
    // can deadlock the executor and makes the future non-Send.
    {
        let vm_state_guard = self.vm_state.read().map_err(|e| {
            CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
        })?;
        if let Some(state) = vm_state_guard.as_ref() {
            let current_memory = state.spec.config.memory.size;
            if size < current_memory {
                return Err(CloudHypervisorError::NotSupported {
                    feature: "Memory hot-shrink".to_string(),
                    reason: "Cloud Hypervisor only supports growing memory, not shrinking. Memory can only be increased from its initial size.".to_string(),
                });
            }
        }
    }
    let api_client = self.get_api_client()?;
    api_client.vm_resize_memory(size).await?;
    Ok(())
}
/// Hot-adds `count` vCPUs on top of the VM's current vCPU count.
///
/// The current count is read from the hypervisor's `vm.info` response.
///
/// # Errors
/// Fails if `count` is zero, if the resulting total would overflow a
/// `u32`, or if the hypervisor rejects the resize.
pub async fn add_vcpu(&self, handle: &VMHandle, count: u32) -> Result<()> {
    self.validate_vm_handle(handle)?;
    if count == 0 {
        return Err(CloudHypervisorError::Validation(
            "vCPU count must be greater than 0".to_string(),
        ));
    }
    let api_client = self.get_api_client()?;
    let vm_info = api_client.vm_info().await?;
    // NOTE(review): falls back to 1 vCPU when `boot_vcpus` is absent —
    // confirm that default against the vm.info schema.
    let current_vcpus = vm_info.config["cpus"]["boot_vcpus"].as_u64().unwrap_or(1) as u32;
    // `+` would silently wrap in release builds; fail loudly instead.
    let new_vcpus = current_vcpus.checked_add(count).ok_or_else(|| {
        CloudHypervisorError::Validation(format!(
            "vCPU count overflow: {} + {}",
            current_vcpus, count
        ))
    })?;
    api_client.vm_resize_vcpu(new_vcpus).await?;
    Ok(())
}
/// Always fails: Cloud Hypervisor cannot hot-remove vCPUs.
pub async fn remove_vcpu(&self, _handle: &VMHandle, _count: u32) -> Result<()> {
    let feature = "vCPU hot-removal".to_string();
    let reason = "Cloud Hypervisor does not support vCPU hot-removal. This is a fundamental limitation of the Linux kernel's vCPU management and ACPI hotplug subsystem. You can only hot-add vCPUs, not remove them.".to_string();
    Err(CloudHypervisorError::NotSupported { feature, reason })
}
/// Returns the last status recorded for the VM.
///
/// This reflects the runtime's own bookkeeping, not a live query of the
/// hypervisor.
pub async fn get_vm_status(&self, handle: &VMHandle) -> Result<VMStatus> {
    self.validate_vm_handle(handle)?;
    let guard = self.vm_state.read().map_err(|e| {
        CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
    })?;
    match guard.as_ref() {
        Some(state) => Ok(state.status.clone()),
        None => Err(CloudHypervisorError::VmNotFound(handle.id.clone())),
    }
}
/// Hot-plugs `device` into the VM and returns the hypervisor-assigned
/// device id.
pub async fn add_device(&self, handle: &VMHandle, device: DeviceConfig) -> Result<String> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    let id = client.vm_add_device(device).await?;
    Ok(id)
}
/// Hot-unplugs the device identified by `device_id`.
pub async fn remove_device(&self, handle: &VMHandle, device_id: &str) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    client.vm_remove_device(device_id).await?;
    Ok(())
}
/// Snapshots the VM to `path` and registers the snapshot with the
/// snapshot manager under the file stem of `path` (falling back to
/// "snapshot" when the stem is unavailable).
pub async fn create_snapshot(&self, handle: &VMHandle, path: &Path) -> Result<()> {
    self.validate_vm_handle(handle)?;
    let client = self.get_api_client()?;
    let destination_url = format!("file://{}", path.display());
    client.vm_snapshot(&destination_url).await?;
    let snapshot_name = path
        .file_stem()
        .and_then(|stem| stem.to_str())
        .unwrap_or("snapshot");
    self.snapshot_manager
        .create_snapshot(&handle.id, snapshot_name, &destination_url)?;
    Ok(())
}
/// Restores a VM from a snapshot at `path`.
///
/// A fresh random handle id is generated because the restore API does
/// not return one; the stored spec uses a default `VMConfig`.
///
/// # Errors
/// Fails if the runtime already has a VM, is not started, or the
/// restore request is rejected by the hypervisor.
pub async fn restore_snapshot(&self, path: &Path) -> Result<VMHandle> {
    // Pre-check in its own scope: the original held a `std::sync`
    // write guard across the two `.await`s below, which can deadlock
    // the executor and makes this future non-Send.
    {
        let vm_state_guard = self.vm_state.read().map_err(|e| {
            CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
        })?;
        if vm_state_guard.is_some() {
            return Err(CloudHypervisorError::VmAlreadyExists(
                "Cannot restore snapshot: runtime already has a VM".to_string(),
            ));
        }
    }
    let api_client = self.get_api_client()?;
    let source_url = format!("file://{}", path.display());
    let restore_config = RestoreConfig {
        source_url,
        prefault: false,
    };
    api_client.vm_restore(restore_config).await?;
    // Queried only to confirm the restored VM is reachable.
    let _vm_info = api_client.vm_info().await?;
    let vm_id = uuid::Uuid::new_v4().to_string();
    let handle = VMHandle::new(vm_id.clone());
    let mut vm_state_guard = self.vm_state.write().map_err(|e| {
        CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
    })?;
    // Re-check: another task may have created a VM while we awaited.
    if vm_state_guard.is_some() {
        return Err(CloudHypervisorError::VmAlreadyExists(
            "Cannot restore snapshot: runtime already has a VM".to_string(),
        ));
    }
    *vm_state_guard = Some(VMState {
        spec: VMSpec {
            id: vm_id,
            config: VMConfig::new(),
        },
        status: VMStatus::Created,
    });
    Ok(handle)
}
/// Lists the snapshot metadata recorded for `vm_id`.
pub fn list_snapshots(&self, vm_id: &str) -> Result<Vec<SnapshotInfo>> {
self.snapshot_manager.list_snapshots(vm_id)
}
/// Deletes the snapshot named `name` for `vm_id` from the manager's
/// records.
pub fn delete_snapshot(&self, vm_id: &str, name: &str) -> Result<()> {
self.snapshot_manager.delete_snapshot(vm_id, name)
}
/// Returns a clone of the API client, or an error if `start` has not
/// been called yet.
pub fn get_api_client(&self) -> Result<APIClient> {
    let guard = self
        .api_client
        .read()
        .map_err(|e| CloudHypervisorError::Api(format!("Failed to lock API client: {}", e)))?;
    match guard.as_ref() {
        Some(client) => Ok(client.clone()),
        None => Err(CloudHypervisorError::Api("Runtime not started".to_string())),
    }
}
/// Ensures a VM exists before an operation proceeds.
///
/// NOTE(review): this only checks that *some* VM exists — it never
/// compares `handle.id` against the stored VM. That is probably fine
/// for a single-VM runtime, but confirm it is intentional.
fn validate_vm_handle(&self, handle: &VMHandle) -> Result<()> {
    let guard = self.vm_state.read().map_err(|e| {
        CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
    })?;
    if guard.is_none() {
        return Err(CloudHypervisorError::VmNotFound(handle.id.clone()));
    }
    Ok(())
}
/// Records `status` as the VM's current state.
///
/// Deliberately a no-op when no VM exists, so teardown paths can call
/// it unconditionally.
fn update_vm_status(&self, status: VMStatus) -> Result<()> {
    let mut guard = self.vm_state.write().map_err(|e| {
        CloudHypervisorError::InvalidState(format!("Failed to lock VM state: {}", e))
    })?;
    if let Some(tracked) = guard.as_mut() {
        tracked.status = status;
    }
    Ok(())
}
}
impl Drop for CloudHypervisorRuntime {
/// Best-effort cleanup: stop the hypervisor process when the runtime
/// is dropped.
fn drop(&mut self) {
// Errors cannot propagate out of `drop`; intentionally ignored.
let _ = self.stop();
}
}