hyperlight_host/hypervisor/
mod.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use log::LevelFilter;
18use tracing::{Span, instrument};
19
20use crate::error::HyperlightError::ExecutionCanceledByHost;
21use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
22use crate::metrics::METRIC_GUEST_CANCELLATION;
23use crate::{HyperlightError, Result, log_then_return, new_error};
24
/// Utilities for handling x87 FPU state
#[cfg(any(kvm, mshv, target_os = "windows"))]
pub mod fpu;
/// Handlers for Hypervisor custom logic
pub mod handlers;
/// HyperV-on-linux functionality
#[cfg(mshv)]
pub mod hyperv_linux;
#[cfg(target_os = "windows")]
/// HyperV-on-windows functionality
pub(crate) mod hyperv_windows;

/// GDB debugging support
#[cfg(gdb)]
pub(crate) mod gdb;

#[cfg(kvm)]
/// Functionality to manipulate KVM-based virtual machines
pub mod kvm;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process
pub(crate) mod surrogate_process;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
pub(crate) mod surrogate_process_manager;
/// WindowsHypervisorPlatform utilities
#[cfg(target_os = "windows")]
pub(crate) mod windows_hypervisor_platform;
/// Safe wrappers around windows types like `PSTR`
#[cfg(target_os = "windows")]
pub(crate) mod wrappers;

/// Crash dump support (provides `generate_crashdump`, used by `VirtualCPU::run`)
#[cfg(crashdump)]
pub(crate) mod crashdump;
59
60use std::fmt::Debug;
61use std::str::FromStr;
62#[cfg(any(kvm, mshv))]
63use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
64use std::sync::{Arc, Mutex};
65#[cfg(any(kvm, mshv))]
66use std::time::Duration;
67
68#[cfg(gdb)]
69use gdb::VcpuStopReason;
70
71#[cfg(gdb)]
72use self::handlers::{DbgMemAccessHandlerCaller, DbgMemAccessHandlerWrapper};
73use self::handlers::{
74    MemAccessHandlerCaller, MemAccessHandlerWrapper, OutBHandlerCaller, OutBHandlerWrapper,
75};
76use crate::mem::ptr::RawPtr;
77
// x86-64 control-register and EFER bit masks. They are only compiled when the
// `init-paging` feature is enabled.
// NOTE(review): their consumers are not visible in this file; presumably the
// hypervisor backends use them when programming the vCPU's special registers.
cfg_if::cfg_if! {
    if #[cfg(feature = "init-paging")] {
        // CR4 bit 5: Physical Address Extension
        pub(crate) const CR4_PAE: u64 = 1 << 5;
        // CR4 bit 9: OS support for FXSAVE/FXRSTOR
        pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
        // CR4 bit 10: OS support for unmasked SIMD floating-point exceptions
        pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;
        // CR0 bit 0: Protection Enable
        pub(crate) const CR0_PE: u64 = 1;
        // CR0 bit 1: Monitor Coprocessor
        pub(crate) const CR0_MP: u64 = 1 << 1;
        // CR0 bit 4: Extension Type
        pub(crate) const CR0_ET: u64 = 1 << 4;
        // CR0 bit 5: Numeric Error
        pub(crate) const CR0_NE: u64 = 1 << 5;
        // CR0 bit 16: Write Protect
        pub(crate) const CR0_WP: u64 = 1 << 16;
        // CR0 bit 18: Alignment Mask
        pub(crate) const CR0_AM: u64 = 1 << 18;
        // CR0 bit 31: Paging
        pub(crate) const CR0_PG: u64 = 1 << 31;
        // EFER bit 8: Long Mode Enable
        pub(crate) const EFER_LME: u64 = 1 << 8;
        // EFER bit 10: Long Mode Active
        pub(crate) const EFER_LMA: u64 = 1 << 10;
        // EFER bit 0: System Call Extensions
        pub(crate) const EFER_SCE: u64 = 1;
        // EFER bit 11: No-Execute Enable
        pub(crate) const EFER_NX: u64 = 1 << 11;
    }
}
96
/// The generic exit reasons that Hyperlight can handle from a hypervisor.
///
/// Each hypervisor's `run` method is responsible for mapping its own
/// hypervisor-specific exit reasons onto these generic ones.
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value.
    ///
    /// Fields, in order (as destructured in `VirtualCPU::run`): port, data,
    /// rip, instruction length.
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read or write from an unmapped address
    /// (the field is that address)
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions.
    ///
    /// Fields, in order (as built in `Hypervisor::get_memory_access_violation`):
    /// the accessed address, the access that was attempted, and the flags of
    /// the region containing the address.
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight
    /// (the field describes the reason)
    Unknown(String),
    /// The operation should be retried; for example, on Linux a call to run
    /// the CPU can return EAGAIN
    Retry(),
}
118
119/// A common set of hypervisor functionality
120pub(crate) trait Hypervisor: Debug + Sync + Send {
121    /// Initialise the internally stored vCPU with the given PEB address and
122    /// random number seed, then run it until a HLT instruction.
123    #[allow(clippy::too_many_arguments)]
124    fn initialise(
125        &mut self,
126        peb_addr: RawPtr,
127        seed: u64,
128        page_size: u32,
129        outb_handle_fn: OutBHandlerWrapper,
130        mem_access_fn: MemAccessHandlerWrapper,
131        guest_max_log_level: Option<LevelFilter>,
132        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
133    ) -> Result<()>;
134
135    /// Dispatch a call from the host to the guest using the given pointer
136    /// to the dispatch function _in the guest's address space_.
137    ///
138    /// Do this by setting the instruction pointer to `dispatch_func_addr`
139    /// and then running the execution loop until a halt instruction.
140    ///
141    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
142    fn dispatch_call_from_host(
143        &mut self,
144        dispatch_func_addr: RawPtr,
145        outb_handle_fn: OutBHandlerWrapper,
146        mem_access_fn: MemAccessHandlerWrapper,
147        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
148    ) -> Result<()>;
149
150    /// Handle an IO exit from the internally stored vCPU.
151    fn handle_io(
152        &mut self,
153        port: u16,
154        data: Vec<u8>,
155        rip: u64,
156        instruction_length: u64,
157        outb_handle_fn: OutBHandlerWrapper,
158    ) -> Result<()>;
159
160    /// Run the vCPU
161    fn run(&mut self) -> Result<HyperlightExit>;
162
163    /// Returns a Some(HyperlightExit::AccessViolation(..)) if the given gpa doesn't have
164    /// access its corresponding region. Returns None otherwise, or if the region is not found.
165    fn get_memory_access_violation(
166        &self,
167        gpa: usize,
168        mem_regions: &[MemoryRegion],
169        access_info: MemoryRegionFlags,
170    ) -> Option<HyperlightExit> {
171        // find the region containing the given gpa
172        let region = mem_regions
173            .iter()
174            .find(|region| region.guest_region.contains(&gpa));
175
176        if let Some(region) = region {
177            if !region.flags.contains(access_info)
178                || region.flags.contains(MemoryRegionFlags::STACK_GUARD)
179            {
180                return Some(HyperlightExit::AccessViolation(
181                    gpa as u64,
182                    access_info,
183                    region.flags,
184                ));
185            }
186        }
187        None
188    }
189
190    /// Get InterruptHandle to underlying VM
191    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>;
192
193    /// Get the logging level to pass to the guest entrypoint
194    fn get_max_log_level(&self) -> u32 {
195        // Check to see if the RUST_LOG environment variable is set
196        // and if so, parse it to get the log_level for hyperlight_guest
197        // if that is not set get the log level for the hyperlight_host
198
199        // This is done as the guest will produce logs based on the log level returned here
200        // producing those logs is expensive and we don't want to do it if the host is not
201        // going to process them
202
203        let val = std::env::var("RUST_LOG").unwrap_or_default();
204
205        let level = if val.contains("hyperlight_guest") {
206            val.split(',')
207                .find(|s| s.contains("hyperlight_guest"))
208                .unwrap_or("")
209                .split('=')
210                .nth(1)
211                .unwrap_or("")
212        } else if val.contains("hyperlight_host") {
213            val.split(',')
214                .find(|s| s.contains("hyperlight_host"))
215                .unwrap_or("")
216                .split('=')
217                .nth(1)
218                .unwrap_or("")
219        } else {
220            // look for a value string that does not contain "="
221            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
222        };
223
224        log::info!("Determined guest log level: {}", level);
225        // Convert the log level string to a LevelFilter
226        // If no value is found, default to Error
227        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
228    }
229
230    /// get a mutable trait object from self
231    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;
232
233    #[cfg(crashdump)]
234    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>>;
235
236    #[cfg(gdb)]
237    /// handles the cases when the vCPU stops due to a Debug event
238    fn handle_debug(
239        &mut self,
240        _dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
241        _stop_reason: VcpuStopReason,
242    ) -> Result<()> {
243        unimplemented!()
244    }
245}
246
247/// A virtual CPU that can be run until an exit occurs
248pub struct VirtualCPU {}
249
250impl VirtualCPU {
251    /// Run the given hypervisor until a halt instruction is reached
252    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
253    pub fn run(
254        hv: &mut dyn Hypervisor,
255        outb_handle_fn: Arc<Mutex<dyn OutBHandlerCaller>>,
256        mem_access_fn: Arc<Mutex<dyn MemAccessHandlerCaller>>,
257        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
258    ) -> Result<()> {
259        loop {
260            match hv.run() {
261                #[cfg(gdb)]
262                Ok(HyperlightExit::Debug(stop_reason)) => {
263                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
264                        log_then_return!(e);
265                    }
266                }
267
268                Ok(HyperlightExit::Halt()) => {
269                    break;
270                }
271                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
272                    hv.handle_io(port, data, rip, instruction_length, outb_handle_fn.clone())?
273                }
274                Ok(HyperlightExit::Mmio(addr)) => {
275                    #[cfg(crashdump)]
276                    crashdump::generate_crashdump(hv)?;
277
278                    mem_access_fn
279                        .clone()
280                        .try_lock()
281                        .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?
282                        .call()?;
283
284                    log_then_return!("MMIO access address {:#x}", addr);
285                }
286                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
287                    #[cfg(crashdump)]
288                    crashdump::generate_crashdump(hv)?;
289
290                    // If GDB is enabled, we handle the debug memory access
291                    // Disregard return value as we want to return the error
292                    #[cfg(gdb)]
293                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
294
295                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
296                        return Err(HyperlightError::StackOverflow());
297                    }
298                    log_then_return!(HyperlightError::MemoryAccessViolation(
299                        addr,
300                        tried,
301                        region_permission
302                    ));
303                }
304                Ok(HyperlightExit::Cancelled()) => {
305                    // Shutdown is returned when the host has cancelled execution
306                    // After termination, the main thread will re-initialize the VM
307                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
308                    log_then_return!(ExecutionCanceledByHost());
309                }
310                Ok(HyperlightExit::Unknown(reason)) => {
311                    #[cfg(crashdump)]
312                    crashdump::generate_crashdump(hv)?;
313                    // If GDB is enabled, we handle the debug memory access
314                    // Disregard return value as we want to return the error
315                    #[cfg(gdb)]
316                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
317
318                    log_then_return!("Unexpected VM Exit {:?}", reason);
319                }
320                Ok(HyperlightExit::Retry()) => continue,
321                Err(e) => {
322                    #[cfg(crashdump)]
323                    crashdump::generate_crashdump(hv)?;
324                    // If GDB is enabled, we handle the debug memory access
325                    // Disregard return value as we want to return the error
326                    #[cfg(gdb)]
327                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
328
329                    return Err(e);
330                }
331            }
332        }
333
334        Ok(())
335    }
336}
337
/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Debug + Send + Sync {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, it interrupts the vcpu and returns `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///     vcpu is not interrupted immediately; instead it is prevented from running **the next time**
    ///     it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function blocks for as long as it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, it interrupts the vcpu and returns `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///     vcpu is not interrupted immediately; instead it is prevented from running **the next time**
    ///     it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function blocks for as long as it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
366
/// Interrupt handle for the Linux hypervisor backends (compiled for `kvm` and
/// `mshv`). It interrupts the vcpu thread by sending it a real-time signal.
#[cfg(any(kvm, mshv))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Invariant: vcpu is running => most significant bit (63) of `running` is set. (Neither converse nor inverse is true)
    ///
    /// Additionally, bits 0-62 track how many times the VCPU has been run. Incremented each time `run()` is called.
    ///
    /// This prevents an ABA problem where:
    /// 1. The VCPU is running (generation N),
    /// 2. It gets cancelled,
    /// 3. Then quickly restarted (generation N+1),
    ///     before the original thread has observed that it was cancelled.
    ///
    /// Without this generation counter, the interrupt logic might assume the VCPU is still
    /// in the *original* run (generation N), see that it's `running`, and re-send the signal.
    /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it,
    /// potentially causing an infinite loop where no effective interrupt is delivered.
    ///
    /// Invariant: If the VCPU is running, `running[bit 0-62]` matches the current run's generation.
    running: AtomicU64,
    /// Invariant: vcpu is running => `tid` is the thread on which it is running.
    /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true.
    tid: AtomicU64,
    /// True when an "interruptor" has requested the VM to be cancelled. Set immediately when
    /// `kill()` is called, and cleared when the vcpu is no longer running.
    /// This is used to
    /// 1. make sure stale signals do not interrupt the
    ///     wrong vcpu (a vcpu may only be interrupted iff `cancel_requested` is true),
    /// 2. ensure that if a vm is killed while a host call is running,
    ///     the vm will not re-enter the guest after the host call returns.
    cancel_requested: AtomicBool,
    /// True when the debugger has requested the VM to be interrupted. Set immediately when
    /// `kill_from_debugger()` is called, and cleared when the vcpu is no longer running.
    /// This is used to make sure stale signals do not interrupt the wrong vcpu
    /// (a vcpu may only be interrupted by a debugger if `debug_interrupt` is true).
    #[cfg(gdb)]
    debug_interrupt: AtomicBool,
    /// Whether the corresponding vm is dropped
    dropped: AtomicBool,
    /// Retry delay between signals sent to the vcpu thread
    retry_delay: Duration,
    /// The offset of the SIGRTMIN signal used to interrupt the vcpu thread
    sig_rt_min_offset: u8,
}
411
#[cfg(any(kvm, mshv))]
impl LinuxInterruptHandle {
    /// Bit 63 of `running`: the "vcpu is running" flag.
    const RUNNING_BIT: u64 = 1 << 63;
    /// Mask of bits 0-62 of `running`, which is also the largest generation value.
    const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1;

    /// Sets the running bit and advances the generation held in bits 0-62,
    /// starting again from 0 once the generation has reached `MAX_GENERATION`.
    ///
    /// The return value is that of `AtomicU64::fetch_update`. The update
    /// closure always yields `Some`, so the result is always `Ok` carrying
    /// the previous raw value.
    fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> {
        let advance = |raw: u64| {
            let generation = raw & Self::MAX_GENERATION;
            let next_generation = if generation == Self::MAX_GENERATION {
                // wrap around
                0
            } else {
                generation + 1
            };
            Some(next_generation | Self::RUNNING_BIT)
        };

        self.running
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, advance)
    }

    /// Clears the running bit while leaving the generation bits untouched.
    ///
    /// Returns the previous raw value of `running`, i.e. the generation
    /// together with the running bit as it was before being cleared.
    fn clear_running_bit(&self) -> u64 {
        // Keeping only bits 0-62 is the same as masking out bit 63
        self.running
            .fetch_and(Self::MAX_GENERATION, Ordering::Relaxed)
    }

    /// Loads `running` once and splits it into (running flag, generation).
    fn get_running_and_generation(&self) -> (bool, u64) {
        let snapshot = self.running.load(Ordering::Relaxed);
        (
            (snapshot & Self::RUNNING_BIT) != 0,
            snapshot & Self::MAX_GENERATION,
        )
    }

    /// Repeatedly signals the vcpu thread, sleeping `retry_delay` between
    /// attempts, for as long as the vcpu is observed running in the same
    /// generation that was seen on the first iteration.
    ///
    /// Returns `true` if at least one signal was sent.
    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut target_generation: Option<u64> = None;
        let mut sent_signal = false;

        loop {
            let (running, generation) = self.get_running_and_generation();
            if !running {
                break;
            }

            // Prevent the ABA problem: remember the generation observed on
            // the first pass and stop once the vcpu has moved on to another.
            if *target_generation.get_or_insert(generation) != generation {
                break;
            }

            log::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            let tid = self.tid.load(Ordering::Relaxed);
            // NOTE(review): relies on `tid` identifying a thread that is
            // still alive while `running` is set; confirm against the code
            // that stores `tid`.
            unsafe {
                libc::pthread_kill(tid as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}
473
#[cfg(any(kvm, mshv))]
impl InterruptHandle for LinuxInterruptHandle {
    /// Records the cancellation request, then signals the vcpu thread.
    /// Returns whether a signal was actually sent.
    fn kill(&self) -> bool {
        // Set the flag first so it is already visible when the signal is sent
        self.cancel_requested.store(true, Ordering::Relaxed);

        self.send_signal()
    }
    /// Records the debugger's interrupt request, then signals the vcpu thread.
    /// Returns whether a signal was actually sent.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.debug_interrupt.store(true, Ordering::Relaxed);
        self.send_signal()
    }
    /// Reads the `dropped` flag.
    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}
490
// Tests for the hypervisor abstraction; only built for test runs on Windows
// or with KVM.
#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use super::handlers::{MemAccessHandler, OutBHandler};
    #[cfg(gdb)]
    use crate::hypervisor::DbgMemAccessHandlerCaller;
    use crate::mem::ptr::RawPtr;
    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    /// No-op debug memory access handler used only to satisfy `initialise`
    #[cfg(gdb)]
    struct DbgMemAccessHandler {}

    #[cfg(gdb)]
    impl DbgMemAccessHandlerCaller for DbgMemAccessHandler {
        // Every operation succeeds without touching any memory
        fn read(&mut self, _offset: usize, _data: &mut [u8]) -> Result<()> {
            Ok(())
        }

        fn write(&mut self, _offset: usize, _data: &[u8]) -> Result<()> {
            Ok(())
        }

        fn get_code_offset(&mut self) -> Result<usize> {
            Ok(0)
        }
    }

    /// Builds a hypervisor partition for the dummy guest binary and checks
    /// that `initialise` succeeds with no-op handlers.
    #[test]
    fn test_initialise() -> Result<()> {
        // Nothing to test on machines without a hypervisor
        if !is_hypervisor_present() {
            return Ok(());
        }

        // Handlers that accept every call and do nothing
        let outb_handler: Arc<Mutex<OutBHandler>> = {
            let func: Box<dyn FnMut(u16, u32) -> Result<()> + Send> =
                Box::new(|_, _| -> Result<()> { Ok(()) });
            Arc::new(Mutex::new(OutBHandler::from(func)))
        };
        let mem_access_handler = {
            let func: Box<dyn FnMut() -> Result<()> + Send> = Box::new(|| -> Result<()> { Ok(()) });
            Arc::new(Mutex::new(MemAccessHandler::from(func)))
        };
        #[cfg(gdb)]
        let dbg_mem_access_handler = Arc::new(Mutex::new(DbgMemAccessHandler {}));

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (_hshm, mut gshm) = sandbox.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut gshm,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
        )?;
        // Positional arguments follow `Hypervisor::initialise`: PEB address,
        // seed, page size, handlers, guest max log level
        vm.initialise(
            RawPtr::from(0x230000),
            1234567890,
            4096,
            outb_handler,
            mem_access_handler,
            None,
            #[cfg(gdb)]
            dbg_mem_access_handler,
        )
    }
}