hyperlight_host/hypervisor/
mod.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use log::LevelFilter;
18use tracing::{Span, instrument};
19
20use crate::error::HyperlightError::ExecutionCanceledByHost;
21use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
22use crate::metrics::METRIC_GUEST_CANCELLATION;
23use crate::{HyperlightError, Result, log_then_return, new_error};
24
25/// Util for handling x87 fpu state
26#[cfg(any(kvm, mshv, target_os = "windows"))]
27pub mod fpu;
28/// Handlers for Hypervisor custom logic
29pub mod handlers;
30/// HyperV-on-linux functionality
31#[cfg(mshv)]
32pub mod hyperv_linux;
33#[cfg(target_os = "windows")]
34/// Hyperv-on-windows functionality
35pub(crate) mod hyperv_windows;
36
37/// GDB debugging support
38#[cfg(gdb)]
39pub(crate) mod gdb;
40
41#[cfg(kvm)]
42/// Functionality to manipulate KVM-based virtual machines
43pub mod kvm;
44#[cfg(target_os = "windows")]
45/// Hyperlight Surrogate Process
46pub(crate) mod surrogate_process;
47#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
49pub(crate) mod surrogate_process_manager;
50/// WindowsHypervisorPlatform utilities
51#[cfg(target_os = "windows")]
52pub(crate) mod windows_hypervisor_platform;
53/// Safe wrappers around windows types like `PSTR`
54#[cfg(target_os = "windows")]
55pub(crate) mod wrappers;
56
57#[cfg(crashdump)]
58pub(crate) mod crashdump;
59
60use std::fmt::Debug;
61use std::str::FromStr;
62#[cfg(any(kvm, mshv))]
63use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
64use std::sync::{Arc, Mutex};
65#[cfg(any(kvm, mshv))]
66use std::time::Duration;
67
68#[cfg(gdb)]
69use gdb::VcpuStopReason;
70
71#[cfg(gdb)]
72use self::handlers::{DbgMemAccessHandlerCaller, DbgMemAccessHandlerWrapper};
73use self::handlers::{
74    MemAccessHandlerCaller, MemAccessHandlerWrapper, OutBHandlerCaller, OutBHandlerWrapper,
75};
76use crate::mem::ptr::RawPtr;
77
// Bit masks for the x86-64 control registers CR4 / CR0 and the EFER MSR.
// Names and bit positions follow the architecture manuals.

/// CR4.PAE (bit 5): physical address extension.
pub(crate) const CR4_PAE: u64 = 1 << 5;
/// CR4.OSFXSR (bit 9): OS support for FXSAVE/FXRSTOR.
pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
/// CR4.OSXMMEXCPT (bit 10): OS support for unmasked SIMD floating-point exceptions.
pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;

/// CR0.PE (bit 0): protection enable.
pub(crate) const CR0_PE: u64 = 1 << 0;
/// CR0.MP (bit 1): monitor coprocessor.
pub(crate) const CR0_MP: u64 = 1 << 1;
/// CR0.ET (bit 4): extension type.
pub(crate) const CR0_ET: u64 = 1 << 4;
/// CR0.NE (bit 5): numeric error.
pub(crate) const CR0_NE: u64 = 1 << 5;
/// CR0.WP (bit 16): write protect.
pub(crate) const CR0_WP: u64 = 1 << 16;
/// CR0.AM (bit 18): alignment mask.
pub(crate) const CR0_AM: u64 = 1 << 18;
/// CR0.PG (bit 31): paging.
pub(crate) const CR0_PG: u64 = 1 << 31;

/// EFER.LME (bit 8): long mode enable.
pub(crate) const EFER_LME: u64 = 1 << 8;
/// EFER.LMA (bit 10): long mode active.
pub(crate) const EFER_LMA: u64 = 1 << 10;
/// EFER.SCE (bit 0): system call extensions.
pub(crate) const EFER_SCE: u64 = 1 << 0;
/// EFER.NXE (bit 11): no-execute enable.
pub(crate) const EFER_NX: u64 = 1 << 11;
92
/// The generic exit reasons that Hyperlight can handle from a hypervisor.
///
/// Each hypervisor's `run` method is responsible for mapping its own
/// hypervisor-specific exit reasons onto these generic ones.
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value.
    ///
    /// Fields, in order: port, data written, `rip`, instruction length
    /// (the order in which they are forwarded to `Hypervisor::handle_io`).
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read or write from an unmapped address.
    ///
    /// The field is the address that was accessed.
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions.
    ///
    /// Fields, in order: the address accessed, the access that was attempted,
    /// and the flags of the memory region containing that address.
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight.
    ///
    /// The field is a description of the exit reason.
    Unknown(String),
    /// The operation should be retried, for example this can happen on Linux where a call to run the CPU can return EAGAIN
    Retry(),
}
114
115/// A common set of hypervisor functionality
116pub(crate) trait Hypervisor: Debug + Sync + Send {
117    /// Initialise the internally stored vCPU with the given PEB address and
118    /// random number seed, then run it until a HLT instruction.
119    #[allow(clippy::too_many_arguments)]
120    fn initialise(
121        &mut self,
122        peb_addr: RawPtr,
123        seed: u64,
124        page_size: u32,
125        outb_handle_fn: OutBHandlerWrapper,
126        mem_access_fn: MemAccessHandlerWrapper,
127        guest_max_log_level: Option<LevelFilter>,
128        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
129    ) -> Result<()>;
130
131    /// Dispatch a call from the host to the guest using the given pointer
132    /// to the dispatch function _in the guest's address space_.
133    ///
134    /// Do this by setting the instruction pointer to `dispatch_func_addr`
135    /// and then running the execution loop until a halt instruction.
136    ///
137    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
138    fn dispatch_call_from_host(
139        &mut self,
140        dispatch_func_addr: RawPtr,
141        outb_handle_fn: OutBHandlerWrapper,
142        mem_access_fn: MemAccessHandlerWrapper,
143        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
144    ) -> Result<()>;
145
146    /// Handle an IO exit from the internally stored vCPU.
147    fn handle_io(
148        &mut self,
149        port: u16,
150        data: Vec<u8>,
151        rip: u64,
152        instruction_length: u64,
153        outb_handle_fn: OutBHandlerWrapper,
154    ) -> Result<()>;
155
156    /// Run the vCPU
157    fn run(&mut self) -> Result<HyperlightExit>;
158
159    /// Returns a Some(HyperlightExit::AccessViolation(..)) if the given gpa doesn't have
160    /// access its corresponding region. Returns None otherwise, or if the region is not found.
161    fn get_memory_access_violation(
162        &self,
163        gpa: usize,
164        mem_regions: &[MemoryRegion],
165        access_info: MemoryRegionFlags,
166    ) -> Option<HyperlightExit> {
167        // find the region containing the given gpa
168        let region = mem_regions
169            .iter()
170            .find(|region| region.guest_region.contains(&gpa));
171
172        if let Some(region) = region {
173            if !region.flags.contains(access_info)
174                || region.flags.contains(MemoryRegionFlags::STACK_GUARD)
175            {
176                return Some(HyperlightExit::AccessViolation(
177                    gpa as u64,
178                    access_info,
179                    region.flags,
180                ));
181            }
182        }
183        None
184    }
185
186    /// Get InterruptHandle to underlying VM
187    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>;
188
189    /// Get the logging level to pass to the guest entrypoint
190    fn get_max_log_level(&self) -> u32 {
191        // Check to see if the RUST_LOG environment variable is set
192        // and if so, parse it to get the log_level for hyperlight_guest
193        // if that is not set get the log level for the hyperlight_host
194
195        // This is done as the guest will produce logs based on the log level returned here
196        // producing those logs is expensive and we don't want to do it if the host is not
197        // going to process them
198
199        let val = std::env::var("RUST_LOG").unwrap_or_default();
200
201        let level = if val.contains("hyperlight_guest") {
202            val.split(',')
203                .find(|s| s.contains("hyperlight_guest"))
204                .unwrap_or("")
205                .split('=')
206                .nth(1)
207                .unwrap_or("")
208        } else if val.contains("hyperlight_host") {
209            val.split(',')
210                .find(|s| s.contains("hyperlight_host"))
211                .unwrap_or("")
212                .split('=')
213                .nth(1)
214                .unwrap_or("")
215        } else {
216            // look for a value string that does not contain "="
217            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
218        };
219
220        log::info!("Determined guest log level: {}", level);
221        // Convert the log level string to a LevelFilter
222        // If no value is found, default to Error
223        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
224    }
225
226    /// get a mutable trait object from self
227    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;
228
229    #[cfg(crashdump)]
230    fn get_memory_regions(&self) -> &[MemoryRegion];
231
232    #[cfg(gdb)]
233    /// handles the cases when the vCPU stops due to a Debug event
234    fn handle_debug(
235        &mut self,
236        _dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
237        _stop_reason: VcpuStopReason,
238    ) -> Result<()> {
239        unimplemented!()
240    }
241}
242
243/// A virtual CPU that can be run until an exit occurs
244pub struct VirtualCPU {}
245
246impl VirtualCPU {
247    /// Run the given hypervisor until a halt instruction is reached
248    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
249    pub fn run(
250        hv: &mut dyn Hypervisor,
251        outb_handle_fn: Arc<Mutex<dyn OutBHandlerCaller>>,
252        mem_access_fn: Arc<Mutex<dyn MemAccessHandlerCaller>>,
253        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
254    ) -> Result<()> {
255        loop {
256            match hv.run() {
257                #[cfg(gdb)]
258                Ok(HyperlightExit::Debug(stop_reason)) => {
259                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
260                        log_then_return!(e);
261                    }
262                }
263
264                Ok(HyperlightExit::Halt()) => {
265                    break;
266                }
267                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
268                    hv.handle_io(port, data, rip, instruction_length, outb_handle_fn.clone())?
269                }
270                Ok(HyperlightExit::Mmio(addr)) => {
271                    #[cfg(crashdump)]
272                    crashdump::crashdump_to_tempfile(hv)?;
273
274                    mem_access_fn
275                        .clone()
276                        .try_lock()
277                        .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?
278                        .call()?;
279
280                    log_then_return!("MMIO access address {:#x}", addr);
281                }
282                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
283                    #[cfg(crashdump)]
284                    crashdump::crashdump_to_tempfile(hv)?;
285
286                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
287                        return Err(HyperlightError::StackOverflow());
288                    }
289                    log_then_return!(HyperlightError::MemoryAccessViolation(
290                        addr,
291                        tried,
292                        region_permission
293                    ));
294                }
295                Ok(HyperlightExit::Cancelled()) => {
296                    // Shutdown is returned when the host has cancelled execution
297                    // After termination, the main thread will re-initialize the VM
298                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
299                    log_then_return!(ExecutionCanceledByHost());
300                }
301                Ok(HyperlightExit::Unknown(reason)) => {
302                    #[cfg(crashdump)]
303                    crashdump::crashdump_to_tempfile(hv)?;
304
305                    log_then_return!("Unexpected VM Exit {:?}", reason);
306                }
307                Ok(HyperlightExit::Retry()) => continue,
308                Err(e) => {
309                    #[cfg(crashdump)]
310                    crashdump::crashdump_to_tempfile(hv)?;
311
312                    return Err(e);
313                }
314            }
315        }
316
317        Ok(())
318    }
319}
320
/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Send + Sync {
    /// Interrupt the corresponding sandbox, stopping its vcpu from running.
    ///
    /// - If this is called while the vcpu is running, it interrupts the vcpu and returns `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu is not interrupted immediately; instead it is prevented from running **the next
    ///   time** it is scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Returns true iff the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
337
/// State shared between a vcpu thread and the callers of `InterruptHandle::kill`
/// on the KVM and mshv backends, where the vcpu thread is interrupted by a signal.
#[cfg(any(kvm, mshv))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Invariant: vcpu is running => most significant bit (63) of `running` is set. (Neither converse nor inverse is true)
    ///
    /// Additionally, bit 0-62 tracks how many times the VCPU has been run. Incremented each time `run()` is called.
    ///
    /// This prevents an ABA problem where:
    /// 1. The VCPU is running (generation N),
    /// 2. It gets cancelled,
    /// 3. Then quickly restarted (generation N+1),
    ///    before the original thread has observed that it was cancelled.
    ///
    /// Without this generation counter, the interrupt logic might assume the VCPU is still
    /// in the *original* run (generation N), see that it's `running`, and re-send the signal.
    /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it,
    /// potentially causing an infinite loop where no effective interrupt is delivered.
    ///
    /// Invariant: If the VCPU is running, bits 0-62 of `running` match the current run's generation.
    running: AtomicU64,
    /// Invariant: vcpu is running => `tid` is the thread on which it is running.
    /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true.
    tid: AtomicU64,
    /// True when an "interruptor" has requested the VM to be cancelled. Set immediately when
    /// `kill()` is called, and cleared when the vcpu is no longer running.
    /// This is used to
    /// 1. make sure stale signals do not interrupt the wrong vcpu
    ///    (a vcpu may only be interrupted iff `cancel_requested` is true),
    /// 2. ensure that if a vm is killed while a host call is running,
    ///    the vm will not re-enter the guest after the host call returns.
    cancel_requested: AtomicBool,
    /// Whether the corresponding vm is dropped
    dropped: AtomicBool,
    /// Retry delay between signals sent to the vcpu thread
    retry_delay: Duration,
    /// The offset of the SIGRTMIN signal used to interrupt the vcpu thread
    sig_rt_min_offset: u8,
}
376
#[cfg(any(kvm, mshv))]
impl LinuxInterruptHandle {
    /// Bit 63 of `running`: set while the vcpu is marked as running.
    const RUNNING_BIT: u64 = 1 << 63;
    /// Largest generation that fits in bits 0-62 of `running`.
    const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1;

    /// Sets the running bit and increments the generation.
    ///
    /// The generation wraps around to 0 after `MAX_GENERATION`. The update closure
    /// always yields a new value, so the result is always `Ok`, carrying the raw
    /// value of `running` from before the update.
    fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> {
        self.running
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |raw| {
                let generation = raw & !Self::RUNNING_BIT;
                let next_generation = if generation == Self::MAX_GENERATION {
                    // restart generation from 0
                    0
                } else {
                    generation + 1
                };
                Some(next_generation | Self::RUNNING_BIT)
            })
    }

    /// Clears the running bit and returns the generation (bits 0-62 of `running`).
    ///
    /// `fetch_and` hands back the whole previous word, running bit included, so
    /// that bit is masked off before returning; otherwise the result would not
    /// be a generation at all whenever the vcpu had been running.
    fn clear_running_bit(&self) -> u64 {
        let previous = self
            .running
            .fetch_and(!Self::RUNNING_BIT, Ordering::Relaxed);
        previous & !Self::RUNNING_BIT
    }

    /// Reads `running` once and splits it into (running bit set?, generation).
    fn get_running_and_generation(&self) -> (bool, u64) {
        let raw = self.running.load(Ordering::Relaxed);
        let running = raw & Self::RUNNING_BIT != 0;
        let generation = raw & !Self::RUNNING_BIT;
        (running, generation)
    }
}
408
#[cfg(any(kvm, mshv))]
impl InterruptHandle for LinuxInterruptHandle {
    /// Requests cancellation, then signals the vcpu thread until it stops running.
    ///
    /// `cancel_requested` is set first. While the running bit is observed set and
    /// the generation is still the one seen on the first iteration, a signal
    /// (`SIGRTMIN + sig_rt_min_offset`) is sent to the thread stored in `tid`,
    /// followed by a sleep of `retry_delay`.
    ///
    /// Returns `true` if at least one signal was sent, `false` otherwise.
    fn kill(&self) -> bool {
        self.cancel_requested.store(true, Ordering::Relaxed);

        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        // Generation observed on the first iteration; only that run is ever signalled.
        let mut first_generation: Option<u64> = None;
        let mut sent_signal = false;

        while let (true, generation) = self.get_running_and_generation() {
            // prevent ABA problem: a different generation means the run we were
            // asked to cancel is over and a new one has started.
            if *first_generation.get_or_insert(generation) != generation {
                break;
            }

            log::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            // NOTE(review): soundness relies on `tid` holding the pthread id of a live
            // vcpu thread while the running bit is set — confirm against the code that
            // stores `tid`. The return value of `pthread_kill` is not checked.
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Relaxed) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }

    /// Reports the current value of the `dropped` flag.
    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}
446
#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use super::handlers::{MemAccessHandler, OutBHandler};
    #[cfg(gdb)]
    use crate::hypervisor::DbgMemAccessHandlerCaller;
    use crate::mem::ptr::RawPtr;
    use crate::sandbox::uninitialized::GuestBinary;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    /// Debug memory-access handler whose operations all succeed without doing anything.
    #[cfg(gdb)]
    struct DbgMemAccessHandler {}

    #[cfg(gdb)]
    impl DbgMemAccessHandlerCaller for DbgMemAccessHandler {
        fn read(&mut self, _offset: usize, _data: &mut [u8]) -> Result<()> {
            Ok(())
        }

        fn write(&mut self, _offset: usize, _data: &[u8]) -> Result<()> {
            Ok(())
        }

        fn get_code_offset(&mut self) -> Result<usize> {
            Ok(0)
        }
    }

    /// Sets up a hypervisor partition for the dummy guest and initialises it with
    /// no-op handlers. Does nothing when no hypervisor is present.
    #[test]
    fn test_initialise() -> Result<()> {
        if !is_hypervisor_present() {
            return Ok(());
        }

        // Handlers that accept every call and do nothing.
        let outb_fn: Box<dyn FnMut(u16, u32) -> Result<()> + Send> =
            Box::new(|_, _| -> Result<()> { Ok(()) });
        let outb_handler: Arc<Mutex<OutBHandler>> =
            Arc::new(Mutex::new(OutBHandler::from(outb_fn)));

        let mem_access_fn: Box<dyn FnMut() -> Result<()> + Send> =
            Box::new(|| -> Result<()> { Ok(()) });
        let mem_access_handler = Arc::new(Mutex::new(MemAccessHandler::from(mem_access_fn)));

        #[cfg(gdb)]
        let dbg_mem_access_handler = Arc::new(Mutex::new(DbgMemAccessHandler {}));

        let guest_path = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config = SandboxConfiguration::default();
        let sandbox = UninitializedSandbox::new(GuestBinary::FilePath(guest_path), Some(config))?;
        let (_hshm, mut gshm) = sandbox.mgr.build();
        let mut vm = set_up_hypervisor_partition(&mut gshm, &config)?;

        vm.initialise(
            RawPtr::from(0x230000),
            1234567890,
            4096,
            outb_handler,
            mem_access_handler,
            None,
            #[cfg(gdb)]
            dbg_mem_access_handler,
        )
    }
}