hyperlight_host/hypervisor/
mod.rs

/*
Copyright 2025  The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

use log::{LevelFilter, debug};
use tracing::{Span, instrument};

use crate::HyperlightError::StackOverflow;
use crate::error::HyperlightError::ExecutionCanceledByHost;
use crate::hypervisor::regs::{
    CommonFpu, CommonRegisters, CommonSegmentRegister, CommonSpecialRegisters,
};
use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
use crate::metrics::METRIC_GUEST_CANCELLATION;
#[cfg(feature = "mem_profile")]
use crate::sandbox::trace::MemTraceInfo;
use crate::{HyperlightError, Result, log_then_return};
/// Hyper-V on Linux functionality
#[cfg(mshv3)]
pub mod hyperv_linux;
#[cfg(target_os = "windows")]
/// Hyper-V on Windows functionality
pub(crate) mod hyperv_windows;

/// GDB debugging support
#[cfg(gdb)]
pub(crate) mod gdb;

/// Abstracts over different hypervisor register representations
pub(crate) mod regs;

#[cfg(kvm)]
/// Functionality to manipulate KVM-based virtual machines
pub mod kvm;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process
pub(crate) mod surrogate_process;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
pub(crate) mod surrogate_process_manager;
/// WindowsHypervisorPlatform utilities
#[cfg(target_os = "windows")]
pub(crate) mod windows_hypervisor_platform;
/// Safe wrappers around windows types like `PSTR`
#[cfg(target_os = "windows")]
pub(crate) mod wrappers;

#[cfg(crashdump)]
pub(crate) mod crashdump;

use std::fmt::Debug;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
#[cfg(any(kvm, mshv3))]
use std::time::Duration;

#[cfg(gdb)]
use gdb::VcpuStopReason;

use crate::mem::mgr::SandboxMemoryManager;
use crate::mem::ptr::RawPtr;
use crate::mem::shared_mem::HostSharedMemory;
use crate::sandbox::host_funcs::FunctionRegistry;

cfg_if::cfg_if! {
    if #[cfg(feature = "init-paging")] {
        pub(crate) const CR4_PAE: u64 = 1 << 5;
        pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
        pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;
        pub(crate) const CR0_PE: u64 = 1;
        pub(crate) const CR0_MP: u64 = 1 << 1;
        pub(crate) const CR0_ET: u64 = 1 << 4;
        pub(crate) const CR0_NE: u64 = 1 << 5;
        pub(crate) const CR0_WP: u64 = 1 << 16;
        pub(crate) const CR0_AM: u64 = 1 << 18;
        pub(crate) const CR0_PG: u64 = 1 << 31;
        pub(crate) const EFER_LME: u64 = 1 << 8;
        pub(crate) const EFER_LMA: u64 = 1 << 10;
        pub(crate) const EFER_SCE: u64 = 1;
        pub(crate) const EFER_NX: u64 = 1 << 11;
    }
}
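// Note: these are the architectural x86-64 control-register and EFER bit
// positions (e.g. CR0.PG is bit 31, EFER.LME is bit 8); `setup_initial_sregs`
// below combines them to start vCPUs in 64-bit long mode with paging enabled.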

/// The generic exit reasons that Hyperlight can handle from a hypervisor.
/// Each hypervisor's `run` method is responsible for mapping its
/// hypervisor-specific exit reasons to these generic ones.
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read or write from an unmapped address
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight
    Unknown(String),
    /// The operation should be retried.
    /// On Linux this can happen when a call to run the CPU returns EAGAIN;
    /// on Windows the platform could cause a cancellation of the VM run.
    Retry(),
}
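// A minimal sketch of the mapping a backend's `run` performs, based on the
// kvm-ioctls crate's `VcpuExit` type (illustrative only; the real KVM driver
// lives in the `kvm` module and handles more cases):
//
//     match vcpu_fd.run() {
//         Ok(VcpuExit::Hlt) => Ok(HyperlightExit::Halt()),
//         Ok(VcpuExit::MmioRead(addr, _)) => Ok(HyperlightExit::Mmio(addr)),
//         Err(e) if e.errno() == libc::EAGAIN => Ok(HyperlightExit::Retry()),
//         Ok(other) => Ok(HyperlightExit::Unknown(format!("{:?}", other))),
//     }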

/// A common set of hypervisor functionality
pub(crate) trait Hypervisor: Debug + Send {
    /// Initialise the internally stored vCPU with the given PEB address and
    /// random number seed, then run it until a HLT instruction.
    #[allow(clippy::too_many_arguments)]
    fn initialise(
        &mut self,
        peb_addr: RawPtr,
        seed: u64,
        page_size: u32,
        mem_mgr: SandboxMemoryManager<HostSharedMemory>,
        host_funcs: Arc<Mutex<FunctionRegistry>>,
        guest_max_log_level: Option<LevelFilter>,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
    ) -> Result<()>;

    /// Map a region of host memory into the sandbox.
    ///
    /// Depending on the host platform, there are likely alignment
    /// requirements of at least one page for base and len.
    unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()>;

    /// Unmap a memory region from the sandbox
    unsafe fn unmap_region(&mut self, rgn: &MemoryRegion) -> Result<()>;

    /// Get the currently mapped dynamic memory regions (not including sandbox regions)
    ///
    /// Note: Box needed for trait to be object-safe :(
    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_>;

    /// Dispatch a call from the host to the guest using the given pointer
    /// to the dispatch function _in the guest's address space_.
    ///
    /// Do this by setting the instruction pointer to `dispatch_func_addr`
    /// and then running the execution loop until a halt instruction.
    ///
    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
    fn dispatch_call_from_host(
        &mut self,
        dispatch_func_addr: RawPtr,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
    ) -> Result<()>;

    /// Handle an IO exit from the internally stored vCPU.
    fn handle_io(
        &mut self,
        port: u16,
        data: Vec<u8>,
        rip: u64,
        instruction_length: u64,
    ) -> Result<()>;

    /// Run the vCPU
    fn run(
        &mut self,
        #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext,
    ) -> Result<HyperlightExit>;

    /// Get an InterruptHandle to the underlying VM (returns the internal trait)
    fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal>;

    /// Get regs
    #[allow(dead_code)]
    fn regs(&self) -> Result<CommonRegisters>;
    /// Set regs
    #[allow(dead_code)]
    fn set_regs(&mut self, regs: &CommonRegisters) -> Result<()>;
    /// Get fpu regs
    #[allow(dead_code)]
    fn fpu(&self) -> Result<CommonFpu>;
    /// Set fpu regs
    #[allow(dead_code)]
    fn set_fpu(&mut self, fpu: &CommonFpu) -> Result<()>;
    /// Get special regs
    #[allow(dead_code)]
    fn sregs(&self) -> Result<CommonSpecialRegisters>;
    /// Set special regs
    #[allow(dead_code)]
    fn set_sregs(&mut self, sregs: &CommonSpecialRegisters) -> Result<()>;

    /// Set up the initial special registers for the hypervisor.
    /// This is a default implementation that works for all hypervisors.
    fn setup_initial_sregs(&mut self, _pml4_addr: u64) -> Result<()> {
        #[cfg(feature = "init-paging")]
        let sregs = CommonSpecialRegisters {
            cr0: CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_AM | CR0_PG | CR0_WP,
            cr4: CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT,
            cr3: _pml4_addr,
            efer: EFER_LME | EFER_LMA | EFER_SCE | EFER_NX,
            cs: CommonSegmentRegister {
                type_: 11,
                present: 1,
                s: 1,
                l: 1,
                ..Default::default()
            },
            tr: CommonSegmentRegister {
                limit: 65535,
                type_: 11,
                present: 1,
                s: 0,
                ..Default::default()
            },
            ..Default::default()
        };

        #[cfg(not(feature = "init-paging"))]
        let sregs = CommonSpecialRegisters {
            cs: CommonSegmentRegister {
                base: 0,
                selector: 0,
                limit: 0xFFFF,
                type_: 11,
                present: 1,
                s: 1,
                ..Default::default()
            },
            ds: CommonSegmentRegister {
                base: 0,
                selector: 0,
                limit: 0xFFFF,
                type_: 3,
                present: 1,
                s: 1,
                ..Default::default()
            },
            tr: CommonSegmentRegister {
                base: 0,
                selector: 0,
                limit: 0xFFFF,
                type_: 11,
                present: 1,
                s: 0,
                ..Default::default()
            },
            ..Default::default()
        };

        self.set_sregs(&sregs)?;
        Ok(())
    }

    /// Get the logging level to pass to the guest entrypoint
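    ///
    /// For example (a sketch of the parsing below, not a compiled doctest):
    /// with `RUST_LOG=hyperlight_guest=debug,info` this returns
    /// `LevelFilter::Debug as u32`, with `RUST_LOG=warn` it returns
    /// `LevelFilter::Warn as u32`, and with `RUST_LOG` unset it defaults to
    /// `LevelFilter::Error as u32`.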
    fn get_max_log_level(&self) -> u32 {
        // Check whether the RUST_LOG environment variable is set and, if so,
        // parse it to get the log level for hyperlight_guest; if that is not
        // set, fall back to the log level for hyperlight_host.

        // This matters because the guest produces logs based on the level
        // returned here; producing those logs is expensive, and we don't want
        // to do it if the host is not going to process them.

        let val = std::env::var("RUST_LOG").unwrap_or_default();

        let level = if val.contains("hyperlight_guest") {
            val.split(',')
                .find(|s| s.contains("hyperlight_guest"))
                .unwrap_or("")
                .split('=')
                .nth(1)
                .unwrap_or("")
        } else if val.contains("hyperlight_host") {
            val.split(',')
                .find(|s| s.contains("hyperlight_host"))
                .unwrap_or("")
                .split('=')
                .nth(1)
                .unwrap_or("")
        } else {
            // look for a value string that does not contain "="
            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
        };

        log::info!("Determined guest log level: {}", level);
        // Convert the log level string to a LevelFilter.
        // If no value is found, default to Error.
        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
    }

    /// Get a mutable trait object from self
    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;

    #[cfg(crashdump)]
    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>>;

    #[cfg(gdb)]
    /// Handles the cases when the vCPU stops due to a Debug event
    fn handle_debug(
        &mut self,
        _dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
        _stop_reason: VcpuStopReason,
    ) -> Result<()> {
        unimplemented!()
    }

    /// Check stack guard to see if the stack is still valid
    fn check_stack_guard(&self) -> Result<bool>;

    #[cfg(feature = "trace_guest")]
    fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()>;

    /// Get a mutable reference to the trace info for the guest
    #[cfg(feature = "mem_profile")]
    fn trace_info_mut(&mut self) -> &mut MemTraceInfo;
}

/// Returns `Some(HyperlightExit::AccessViolation(..))` if the given access to
/// the region containing `gpa` is not permitted (or if that region is a stack
/// guard page). Returns `None` if the access is permitted, or if no region
/// contains `gpa`.
pub(crate) fn get_memory_access_violation<'a>(
    gpa: usize,
    mut mem_regions: impl Iterator<Item = &'a MemoryRegion>,
    access_info: MemoryRegionFlags,
) -> Option<HyperlightExit> {
    // find the region containing the given gpa
    let region = mem_regions.find(|region| region.guest_region.contains(&gpa));

    if let Some(region) = region
        && (!region.flags.contains(access_info)
            || region.flags.contains(MemoryRegionFlags::STACK_GUARD))
    {
        return Some(HyperlightExit::AccessViolation(
            gpa as u64,
            access_info,
            region.flags,
        ));
    }
    None
}
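// For example: given a region mapped READ | EXECUTE, a WRITE access to an
// address inside it yields `Some(AccessViolation(gpa, WRITE, READ | EXECUTE))`,
// while an address outside every region yields `None`.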

/// A virtual CPU that can be run until an exit occurs
pub struct VirtualCPU {}

impl VirtualCPU {
    /// Run the given hypervisor until a halt instruction is reached
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn run(
        hv: &mut dyn Hypervisor,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
    ) -> Result<()> {
        // Keeps the trace context and open spans
        #[cfg(feature = "trace_guest")]
        let mut tc = crate::sandbox::trace::TraceContext::new();

        loop {
            #[cfg(feature = "trace_guest")]
            let result = {
                let result = hv.run(&mut tc);
                // End the current host trace by closing the current span, which
                // captures traces happening when a guest exits and re-enters.
                tc.end_host_trace();

                // Handle the guest trace data, if any
                if let Err(e) = hv.handle_trace(&mut tc) {
                    // If no trace data is available, we just log a message and continue
                    // Is this the right thing to do?
                    log::debug!("Error handling guest trace: {:?}", e);
                }

                result
            };
            #[cfg(not(feature = "trace_guest"))]
            let result = hv.run();

            match result {
                #[cfg(gdb)]
                Ok(HyperlightExit::Debug(stop_reason)) => {
                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
                        log_then_return!(e);
                    }
                }

                Ok(HyperlightExit::Halt()) => {
                    break;
                }
                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
                    hv.handle_io(port, data, rip, instruction_length)?
                }
                Ok(HyperlightExit::Mmio(addr)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;

                    if !hv.check_stack_guard()? {
                        log_then_return!(StackOverflow());
                    }

                    log_then_return!("MMIO access address {:#x}", addr);
                }
                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;

                    // If GDB is enabled, we handle the debug memory access;
                    // disregard the return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
                        return Err(HyperlightError::StackOverflow());
                    }
                    log_then_return!(HyperlightError::MemoryAccessViolation(
                        addr,
                        tried,
                        region_permission
                    ));
                }
                Ok(HyperlightExit::Cancelled()) => {
                    // Cancelled is returned when the host has cancelled execution.
                    // After termination, the main thread will re-initialize the VM.
                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
                    log_then_return!(ExecutionCanceledByHost());
                }
                Ok(HyperlightExit::Unknown(reason)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;
                    // If GDB is enabled, we handle the debug memory access;
                    // disregard the return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    log_then_return!("Unexpected VM Exit {:?}", reason);
                }
                Ok(HyperlightExit::Retry()) => {
                    debug!("[VCPU] Retry - continuing VM run loop");
                    continue;
                }
                Err(e) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;
                    // If GDB is enabled, we handle the debug memory access;
                    // disregard the return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    return Err(e);
                }
            }
        }

        Ok(())
    }
}

/// A trait for handling interrupts to a sandbox's vcpu (public API)
pub trait InterruptHandle: Debug + Send + Sync {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// This method attempts to cancel a currently executing guest function call by sending
    /// a signal to the VCPU thread. It uses generation tracking and the call_active flag to
    /// ensure the interruption is safe and precise.
    ///
    /// # Behavior
    ///
    /// - **Guest function running**: If called while a guest function is executing (VCPU running
    ///   or in a host function call), this stamps the current generation into cancel_requested
    ///   and sends a signal to interrupt the VCPU. Returns `true`.
    ///
    /// - **No active call**: If called when no guest function call is in progress (call_active=false),
    ///   this has no effect and returns `false`. This prevents "kill-in-advance" where kill()
    ///   is called before a guest function starts.
    ///
    /// - **During host function**: If the guest call is currently executing a host function
    ///   (VCPU not running but call_active=true), this stamps cancel_requested. When the
    ///   host function returns and attempts to re-enter the guest, the cancellation will
    ///   be detected and the call will abort. Returns `true`.
    ///
    /// # Generation Tracking
    ///
    /// The method stamps the current generation number along with the cancellation request.
    /// This ensures that:
    /// - Stale signals from previous calls are ignored (generation mismatch)
    /// - Only the intended guest function call is affected
    /// - Multiple rapid kill() calls on the same generation are idempotent
    ///
    /// # Blocking Behavior
    ///
    /// This function will block while attempting to deliver the signal to the VCPU thread,
    /// retrying until either:
    /// - The signal is successfully delivered (VCPU transitions from running to not running)
    /// - The VCPU stops running for another reason (e.g., the call completes normally)
    ///
    /// # Returns
    ///
    /// - `true`: Cancellation request was stamped (kill will take effect)
    /// - `false`: No active call, cancellation request was not stamped (no effect)
    ///
    /// # Note
    ///
    /// To reliably interrupt a guest call, ensure `kill()` is called while the guest
    /// function is actually executing. Calling kill() before call_guest_function() will
    /// have no effect.
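    ///
    /// # Example
    ///
    /// A sketch (not a compiled doctest) using the sandbox-level API, which is
    /// assumed here to expose this handle via `interrupt_handle()` and a
    /// `call_guest_function_by_name` method with this shape:
    ///
    /// ```ignore
    /// let handle = sandbox.interrupt_handle();
    /// let killer = std::thread::spawn(move || {
    ///     std::thread::sleep(std::time::Duration::from_millis(100));
    ///     // Returns true only if a guest call is active at this point
    ///     handle.kill();
    /// });
    /// // A long-running guest call on this thread may now fail with
    /// // `ExecutionCanceledByHost`.
    /// let result = sandbox.call_guest_function_by_name::<i32>("Spin", ());
    /// killer.join().unwrap();
    /// ```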
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example, during a host call), the
    ///   vcpu will not immediately be interrupted, but it will be prevented from running **the next time**
    ///   it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Check if the corresponding VM has been dropped.
    fn dropped(&self) -> bool;
}

/// Internal trait for interrupt handle implementation details (private, cross-platform).
///
/// This trait contains all the internal atomics-access methods and helper functions
/// that are shared between the Linux and Windows implementations. It extends InterruptHandle
/// to inherit the public API.
///
/// This trait should NOT be used outside of hypervisor implementations.
pub(crate) trait InterruptHandleInternal: InterruptHandle {
    /// Returns the call_active atomic bool reference for internal implementations.
    fn get_call_active(&self) -> &AtomicBool;

    /// Returns the running atomic u64 reference for internal implementations.
    fn get_running(&self) -> &AtomicU64;

    /// Returns the cancel_requested atomic u64 reference for internal implementations.
    fn get_cancel_requested(&self) -> &AtomicU64;

    /// Set call_active - increments generation and sets flag.
    ///
    /// Increments the generation counter and sets the call_active flag to true,
    /// indicating that a guest function call is now in progress. This allows
    /// kill() to stamp cancel_requested with the correct generation.
    ///
    /// Must be called at the start of call_guest_function_by_name_no_reset(),
    /// before any VCPU execution begins.
    ///
    /// Returns true if call_active was already set (indicating a guard already exists),
    /// false otherwise.
    fn set_call_active(&self) -> bool {
        self.increment_generation();
        self.get_call_active().swap(true, Ordering::AcqRel)
    }

    /// Clear call_active - clears the call_active flag.
    ///
    /// Clears the call_active flag, indicating that no guest function call is
    /// in progress. After this, kill() will have no effect and will return false.
    ///
    /// Must be called at the end of call_guest_function_by_name_no_reset(),
    /// after the guest call has fully completed (whether successfully or with an error).
    fn clear_call_active(&self) {
        self.get_call_active().store(false, Ordering::Release)
    }

    /// Set cancel_requested to true with the given generation.
    ///
    /// This stamps the cancellation request with the current generation number,
    /// ensuring that only the VCPU running with this exact generation will honor
    /// the cancellation.
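    ///
    /// For example, a cancellation stamped for generation 5 is stored as
    /// `(1 << 63) | 5`, i.e. `0x8000_0000_0000_0005`.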
    fn set_cancel_requested(&self, generation: u64) {
        const CANCEL_REQUESTED_BIT: u64 = 1 << 63;
        const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1;
        let value = CANCEL_REQUESTED_BIT | (generation & MAX_GENERATION);
        self.get_cancel_requested().store(value, Ordering::Release);
    }

    /// Clear cancel_requested (reset to no cancellation).
    ///
    /// This is called after a cancellation has been processed to reset the
    /// cancellation flag for the next guest call.
    fn clear_cancel_requested(&self) {
        self.get_cancel_requested().store(0, Ordering::Release);
    }

    /// Check if cancel_requested is set for the given generation.
    ///
    /// Returns true only if BOTH:
    /// - The cancellation flag is set
    /// - The stored generation matches the provided generation
    ///
    /// This prevents stale cancellations from affecting new guest calls.
    fn is_cancel_requested_for_generation(&self, generation: u64) -> bool {
        const CANCEL_REQUESTED_BIT: u64 = 1 << 63;
        const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1;
        let raw = self.get_cancel_requested().load(Ordering::Acquire);
        let is_set = raw & CANCEL_REQUESTED_BIT != 0;
        let stored_generation = raw & MAX_GENERATION;
        is_set && stored_generation == generation
    }

    /// Set running bit to true, return current generation.
    ///
    /// This is called when the VCPU is about to enter guest mode. It atomically
    /// sets the running flag while preserving the generation counter.
    fn set_running_bit(&self) -> u64 {
        const RUNNING_BIT: u64 = 1 << 63;
        self.get_running()
            .fetch_update(Ordering::Release, Ordering::Acquire, |raw| {
                Some(raw | RUNNING_BIT)
            })
            .map(|raw| raw & !RUNNING_BIT) // Return the current generation
            .unwrap_or(0)
    }

    /// Increment the generation for a new guest function call.
    ///
    /// The generation counter wraps around at MAX_GENERATION (2^63 - 1).
    /// This is called at the start of each new guest function call to provide
    /// a unique identifier that prevents ABA problems with stale cancellations.
    ///
    /// Returns the NEW generation number (after incrementing).
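    ///
    /// For example, if `running` holds `RUNNING_BIT | 41`, it becomes
    /// `RUNNING_BIT | 42` and 42 is returned; at MAX_GENERATION the counter
    /// wraps back around to 0.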
    fn increment_generation(&self) -> u64 {
        const RUNNING_BIT: u64 = 1 << 63;
        const MAX_GENERATION: u64 = RUNNING_BIT - 1;
        self.get_running()
            .fetch_update(Ordering::Release, Ordering::Acquire, |raw| {
                let current_generation = raw & !RUNNING_BIT;
                let running_bit = raw & RUNNING_BIT;
                if current_generation == MAX_GENERATION {
                    // Restart generation from 0
                    return Some(running_bit);
                }
                Some((current_generation + 1) | running_bit)
            })
            .map(|old| {
                let old_generation = old & !RUNNING_BIT;
                if old_generation == MAX_GENERATION {
                    // Wrapped around: the new generation is 0
                    0
                } else {
                    old_generation + 1 // Return the NEW generation
                }
            })
            // Unreachable: the closure above always returns Some
            .unwrap_or(0)
    }

    /// Get the current running state and generation counter.
    ///
    /// Returns a tuple of (running, generation) where:
    /// - running: true if VCPU is currently in guest mode
    /// - generation: current generation counter value
    fn get_running_and_generation(&self) -> (bool, u64) {
        const RUNNING_BIT: u64 = 1 << 63;
        let raw = self.get_running().load(Ordering::Acquire);
        let running = raw & RUNNING_BIT != 0;
        let generation = raw & !RUNNING_BIT;
        (running, generation)
    }

    /// Clear the running bit and return the old value.
    ///
    /// This is called when the VCPU exits from guest mode back to host mode.
    /// The return value (which includes the generation and the old running bit)
    /// is currently unused by all callers.
    fn clear_running_bit(&self) -> u64 {
        const RUNNING_BIT: u64 = 1 << 63;
        self.get_running()
            .fetch_and(!RUNNING_BIT, Ordering::Release)
    }
}
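// Typical lifecycle of these primitives (a sketch assembled from the doc
// comments above, not a verbatim copy of the call sites):
//
//     let gen = handle.increment_generation();    // via set_call_active()
//     loop {
//         handle.set_running_bit();               // entering guest mode
//         // ... vcpu.run() ...
//         handle.clear_running_bit();             // back in host mode
//         if handle.is_cancel_requested_for_generation(gen) {
//             break; // kill() stamped this exact generation
//         }
//     }
//     handle.clear_call_active();                 // guest call finished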

#[cfg(any(kvm, mshv3))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Atomic flag combining running state and generation counter.
    ///
    /// **Bit 63**: VCPU running state (1 = running, 0 = not running)
    /// **Bits 0-62**: Generation counter (incremented once per guest function call)
    ///
    /// # Generation Tracking
    ///
    /// The generation counter is incremented once at the start of each guest function call
    /// and remains constant throughout that call, even if the VCPU is run multiple times
    /// (due to host function calls, retries, etc.). This design solves the race condition
    /// where a kill() from a previous call could spuriously cancel a new call.
    ///
    /// ## Why Generations Are Needed
    ///
    /// Consider this scenario WITHOUT generation tracking:
    /// 1. Thread A starts guest call 1, VCPU runs
    /// 2. Thread B calls kill(), sends signal to Thread A
    /// 3. Guest call 1 completes before signal arrives
    /// 4. Thread A starts guest call 2, VCPU runs again
    /// 5. Stale signal from step 2 arrives and incorrectly cancels call 2
    ///
    /// WITH generation tracking:
    /// 1. Thread A starts guest call 1 (generation N), VCPU runs
    /// 2. Thread B calls kill(), stamps cancel_requested with generation N
    /// 3. Guest call 1 completes, signal may or may not have arrived yet
    /// 4. Thread A starts guest call 2 (generation N+1), VCPU runs again
    /// 5. If stale signal arrives, signal handler checks: cancel_requested.generation (N) != current generation (N+1)
    /// 6. Stale signal is ignored, call 2 continues normally
    ///
    /// ## Per-Call vs Per-Run Generation
    ///
    /// It's critical that generation is incremented per GUEST FUNCTION CALL, not per vcpu.run():
    /// - A single guest function call may invoke vcpu.run() multiple times (host calls, retries)
    /// - All run() calls within the same guest call must share the same generation
    /// - This ensures kill() affects the entire guest function call atomically
    ///
    /// # Invariants
    ///
    /// - If VCPU is running: bit 63 is set (neither converse nor inverse holds)
    /// - If VCPU is running: bits 0-62 match the current guest call's generation
    running: AtomicU64,

    /// Thread ID where the VCPU is currently running.
    ///
    /// # Invariants
    ///
    /// - If VCPU is running: tid contains the thread ID of the executing thread
    /// - Multiple VMs may share the same tid, but at most one will have running=true
    tid: AtomicU64,

    /// Generation-aware cancellation request flag.
    ///
    /// **Bit 63**: Cancellation requested flag (1 = kill requested, 0 = no kill)
    /// **Bits 0-62**: Generation number when cancellation was requested
    ///
    /// # Purpose
    ///
    /// This flag serves three critical functions:
    ///
    /// 1. **Prevent stale signals**: A VCPU may only be interrupted if cancel_requested
    ///    is set AND the generation matches the current call's generation
    ///
    /// 2. **Handle host function calls**: If kill() is called while a host function is
    ///    executing (VCPU not running but call is active), cancel_requested is stamped
    ///    with the current generation. When the host function returns and the VCPU
    ///    attempts to re-enter the guest, it will see the cancellation and abort.
    ///
    /// 3. **Detect stale kills**: If cancel_requested.generation doesn't match the
    ///    current generation, it's from a previous call and should be ignored
    ///
    /// # States and Transitions
    ///
    /// - **No cancellation**: cancel_requested = 0 (bit 63 clear)
    /// - **Cancellation for generation N**: cancel_requested = (1 << 63) | N
    /// - Signal handler checks: (cancel_requested & 0x7FFFFFFFFFFFFFFF) == current_generation
    cancel_requested: AtomicU64,

    /// Flag indicating whether a guest function call is currently in progress.
    ///
    /// **true**: A guest function call is active (between call start and completion)
    /// **false**: No guest function call is active
    ///
    /// # Purpose
    ///
    /// This flag prevents kill() from having any effect when called outside of a
    /// guest function call. This solves the "kill-in-advance" problem where kill()
    /// could be called before a guest function starts and would incorrectly cancel it.
    ///
    /// # Behavior
    ///
    /// - Set to true at the start of call_guest_function_by_name_no_reset()
    /// - Cleared at the end of call_guest_function_by_name_no_reset()
    /// - kill() only stamps cancel_requested if call_active is true
    /// - If kill() is called when call_active=false, it returns false and has no effect
    ///
    /// # Why AtomicBool is Safe
    ///
    /// Although there's a theoretical race where:
    /// 1. Thread A checks call_active (false)
    /// 2. Thread B sets call_active (true) and starts guest call
    /// 3. Thread A's kill() returns false (no effect)
    ///
    /// This is acceptable because the generation tracking provides an additional
    /// safety layer. Even if a stale kill somehow stamped cancel_requested, the
    /// generation mismatch would cause it to be ignored.
    call_active: AtomicBool,

    /// Debugger interrupt request flag (GDB only).
    ///
    /// Set when kill_from_debugger() is called, cleared when VCPU stops running.
    /// Used to distinguish debugger interrupts from normal kill() interrupts.
    #[cfg(gdb)]
    debug_interrupt: AtomicBool,

    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,

    /// Delay between retry attempts when sending signals to the VCPU thread.
    retry_delay: Duration,

    /// Offset from SIGRTMIN for the signal used to interrupt the VCPU thread.
    sig_rt_min_offset: u8,
}

#[cfg(any(kvm, mshv3))]
impl LinuxInterruptHandle {
    fn send_signal(&self, stamp_generation: bool) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent_signal = false;
        let mut target_generation: Option<u64> = None;

        loop {
            if !self.call_active.load(Ordering::Acquire) {
                // No active call, so no need to send signal
                break;
            }

            let (running, generation) = self.get_running_and_generation();

            // Stamp generation into cancel_requested if requested and this is the first iteration
            // We stamp even when running=false to support killing during host function calls
            // The generation tracking will prevent stale kills from affecting new calls
            // Only stamp if a call is actually active (call_active=true)
            if stamp_generation
                && target_generation.is_none()
                && self.call_active.load(Ordering::Acquire)
            {
                self.set_cancel_requested(generation);
                target_generation = Some(generation);
            }

            // If not running, we've stamped the generation (if requested), so we're done
            // This handles the host function call scenario
            if !running {
                break;
            }

            match target_generation {
                None => target_generation = Some(generation),
                // prevent ABA problem
                Some(expected) if expected != generation => break,
                _ => {}
            }

            log::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        if !self.call_active.load(Ordering::Acquire) {
            // No active call, so no effect
            return false;
        }

        // send_signal will stamp the generation into cancel_requested
        // right before sending each signal, ensuring they're always in sync
        self.send_signal(true)
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.debug_interrupt.store(true, Ordering::Relaxed);
        self.send_signal(false)
    }

    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandleInternal for LinuxInterruptHandle {
    fn get_call_active(&self) -> &AtomicBool {
        &self.call_active
    }

    fn get_running(&self) -> &AtomicU64 {
        &self.running
    }

    fn get_cancel_requested(&self) -> &AtomicU64 {
        &self.cancel_requested
    }
}

#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    #[test]
    fn test_initialise() -> Result<()> {
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mem_mgr, mut gshm) = sandbox.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut gshm,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(log::LevelFilter::Error);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            mem_mgr,
            host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )?;

        Ok(())
    }
}