// herolib-virt 0.3.13
//
// Virtualization and container management for herolib (buildah, nerdctl, kubernetes).
use crate::cloudhv::errors::{CloudHypervisorError, Result};
use std::fs;
use std::path::{Path, PathBuf};
use std::process::{Child, Command, ExitStatus, Stdio};
use std::sync::{Arc, Mutex};

/// Handle for a Cloud Hypervisor process launched from a local binary.
///
/// The handle is configured through `new` and the `with_*` builder methods,
/// launched with `start`, and terminated with `stop`.
pub struct CloudHypervisorProcess {
    /// Path of the executable that `start` spawns; `new` rejects paths that do not exist.
    binary_path: PathBuf,
    /// Path passed to the binary as `--api-socket`; `new` defaults it to `/tmp/ch-api.sock`.
    api_socket: PathBuf,
    /// When set, passed to the binary as `--log-file`.
    log_file: Option<PathBuf>,
    /// When `false`, `start` passes `--seccomp false`; `new` defaults it to `true`.
    seccomp: bool,
    /// When `false`, `start` passes `--landlock false`; `new` defaults it to `true`.
    landlock: bool,
    /// Handle of the spawned child; `None` until `start` succeeds and again after `stop`.
    child: Arc<Mutex<Option<Child>>>,
}

impl CloudHypervisorProcess {
    pub fn new(binary: impl Into<PathBuf>) -> Result<Self> {
        let binary_path = binary.into();
        if !binary_path.exists() {
            return Err(CloudHypervisorError::Process(format!(
                "Binary not found: {}",
                binary_path.display()
            )));
        }

        Ok(Self {
            binary_path,
            api_socket: PathBuf::from("/tmp/ch-api.sock"),
            log_file: None,
            seccomp: true,
            landlock: true,
            child: Arc::new(Mutex::new(None)),
        })
    }

    pub fn with_api_socket(mut self, socket: impl Into<PathBuf>) -> Self {
        self.api_socket = socket.into();
        self
    }

    pub fn with_log_file(mut self, file: impl Into<PathBuf>) -> Self {
        self.log_file = Some(file.into());
        self
    }

    pub fn with_seccomp(mut self, enabled: bool) -> Self {
        self.seccomp = enabled;
        self
    }

    pub fn with_landlock(mut self, enabled: bool) -> Self {
        self.landlock = enabled;
        self
    }

    pub fn start(&mut self) -> Result<()> {
        let mut guard = self.child.lock().map_err(|e| {
            CloudHypervisorError::Process(format!("Failed to lock child process: {}", e))
        })?;

        if guard.is_some() {
            return Err(CloudHypervisorError::Process(
                "Process already started".to_string(),
            ));
        }

        if self.api_socket.exists() {
            std::fs::remove_file(&self.api_socket)?;
        }

        let mut cmd = Command::new(&self.binary_path);
        cmd.arg("--api-socket").arg(&self.api_socket);

        if let Some(log_file) = &self.log_file {
            cmd.arg("--log-file").arg(log_file);
        }

        if !self.seccomp {
            cmd.arg("--seccomp").arg("false");
        }

        if !self.landlock {
            cmd.arg("--landlock").arg("false");
        }

        cmd.stdin(Stdio::null())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped());

        let mut child = cmd.spawn().map_err(|e| {
            CloudHypervisorError::Process(format!("Failed to spawn process: {}", e))
        })?;

        // Wait for socket with retries
        let max_retries = 20;
        let mut retries = 0;
        while retries < max_retries {
            if self.api_socket.exists() {
                break;
            }
            
            // Check if process is still running
            match child.try_wait() {
                Ok(Some(status)) => {
                    // Process exited
                    let mut stdout = String::new();
                    let mut stderr = String::new();
                    if let Some(mut out) = child.stdout.take() {
                        let _ = std::io::Read::read_to_string(&mut out, &mut stdout);
                    }
                    if let Some(mut err) = child.stderr.take() {
                        let _ = std::io::Read::read_to_string(&mut err, &mut stderr);
                    }
                    return Err(CloudHypervisorError::Process(format!(
                        "Cloud Hypervisor exited with status: {}. Stdout: {}. Stderr: {}",
                        status, stdout, stderr
                    )));
                }
                Ok(None) => {
                    // Still running, wait a bit
                    std::thread::sleep(std::time::Duration::from_millis(100));
                    retries += 1;
                }
                Err(e) => {
                    return Err(CloudHypervisorError::Process(format!(
                        "Failed to check process status: {}",
                        e
                    )));
                }
            }
        }

        if !self.api_socket.exists() {
            return Err(CloudHypervisorError::Process(
                "API socket not created after 2 seconds. Cloud Hypervisor may be slow to start or failed.".to_string(),
            ));
        }

        *guard = Some(child);
        Ok(())
    }

    pub fn stop(&mut self) -> Result<()> {
        let mut guard = self.child.lock().map_err(|e| {
            CloudHypervisorError::Process(format!("Failed to lock child process: {}", e))
        })?;

        if let Some(mut child) = guard.take() {
            child.kill()?;
            child.wait()?;
        }

        if self.api_socket.exists() {
            let _ = std::fs::remove_file(&self.api_socket);
        }

        Ok(())
    }

    pub fn wait(&self) -> Result<ExitStatus> {
        let mut guard = self.child.lock().map_err(|e| {
            CloudHypervisorError::Process(format!("Failed to lock child process: {}", e))
        })?;

        if let Some(child) = guard.as_mut() {
            let status = child.wait()?;
            Ok(status)
        } else {
            Err(CloudHypervisorError::Process(
                "No process running".to_string(),
            ))
        }
    }

    pub fn is_running(&self) -> bool {
        let guard = self.child.lock().ok();
        if let Some(guard) = guard {
            if let Some(child) = guard.as_ref() {
                return child.id() > 0;
            }
        }
        false
    }

    pub fn api_socket(&self) -> &Path {
        &self.api_socket
    }

    /// Get the process ID of the running Cloud Hypervisor process
    pub fn pid(&self) -> Result<u32> {
        let guard = self.child.lock().map_err(|e| {
            CloudHypervisorError::Process(format!("Failed to lock child process: {}", e))
        })?;

        if let Some(child) = guard.as_ref() {
            Ok(child.id())
        } else {
            Err(CloudHypervisorError::Process(
                "No process running".to_string(),
            ))
        }
    }

    /// Set the OOM (Out-of-Memory) score adjustment for the Cloud Hypervisor process.
    /// 
    /// Lower values make the process less likely to be killed by the OOM killer.
    /// Range: -1000 to 1000
    /// - -1000: Never kill
    /// - 0: Default
    /// - 1000: Always kill first
    /// 
    /// ZosBase uses -200 to protect VMs from being killed.
    pub fn set_oom_score_adj(&self, score: i32) -> Result<()> {
        if score < -1000 || score > 1000 {
            return Err(CloudHypervisorError::Validation(
                format!("OOM score must be between -1000 and 1000, got {}", score)
            ));
        }

        let pid = self.pid()?;
        let oom_path = format!("/proc/{}/oom_score_adj", pid);
        
        fs::write(&oom_path, score.to_string())
            .map_err(|e| CloudHypervisorError::Process(
                format!("Failed to set OOM score for PID {}: {}", pid, e)
            ))?;

        Ok(())
    }
}

impl Drop for CloudHypervisorProcess {
    /// Best-effort shutdown when the handle goes out of scope: calls `stop`.
    fn drop(&mut self) {
        // `drop` cannot report failures, so any error from `stop` is discarded.
        self.stop().ok();
    }
}