hyperlight_host/hypervisor/mod.rs
/*
Copyright 2025 The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

use log::{LevelFilter, debug};
use tracing::{Span, instrument};

use crate::HyperlightError::StackOverflow;
use crate::error::HyperlightError::ExecutionCanceledByHost;
use crate::hypervisor::regs::{
    CommonFpu, CommonRegisters, CommonSegmentRegister, CommonSpecialRegisters,
};
use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
use crate::metrics::METRIC_GUEST_CANCELLATION;
#[cfg(feature = "mem_profile")]
use crate::sandbox::trace::MemTraceInfo;
use crate::{HyperlightError, Result, log_then_return};
/// HyperV-on-Linux functionality
#[cfg(mshv3)]
pub mod hyperv_linux;
#[cfg(target_os = "windows")]
/// HyperV-on-Windows functionality
pub(crate) mod hyperv_windows;

/// GDB debugging support
#[cfg(gdb)]
pub(crate) mod gdb;

/// Abstracts over different hypervisor register representations
pub(crate) mod regs;

#[cfg(kvm)]
/// Functionality to manipulate KVM-based virtual machines
pub mod kvm;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process
pub(crate) mod surrogate_process;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
pub(crate) mod surrogate_process_manager;
/// WindowsHypervisorPlatform utilities
#[cfg(target_os = "windows")]
pub(crate) mod windows_hypervisor_platform;
/// Safe wrappers around Windows types like `PSTR`
#[cfg(target_os = "windows")]
pub(crate) mod wrappers;

#[cfg(crashdump)]
pub(crate) mod crashdump;

use std::fmt::Debug;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
#[cfg(any(kvm, mshv3))]
use std::time::Duration;

#[cfg(gdb)]
use gdb::VcpuStopReason;

use crate::mem::mgr::SandboxMemoryManager;
use crate::mem::ptr::RawPtr;
use crate::mem::shared_mem::HostSharedMemory;
use crate::sandbox::host_funcs::FunctionRegistry;

cfg_if::cfg_if! {
    if #[cfg(feature = "init-paging")] {
        pub(crate) const CR4_PAE: u64 = 1 << 5;
        pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
        pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;
        pub(crate) const CR0_PE: u64 = 1;
        pub(crate) const CR0_MP: u64 = 1 << 1;
        pub(crate) const CR0_ET: u64 = 1 << 4;
        pub(crate) const CR0_NE: u64 = 1 << 5;
        pub(crate) const CR0_WP: u64 = 1 << 16;
        pub(crate) const CR0_AM: u64 = 1 << 18;
        pub(crate) const CR0_PG: u64 = 1 << 31;
        pub(crate) const EFER_LME: u64 = 1 << 8;
        pub(crate) const EFER_LMA: u64 = 1 << 10;
        pub(crate) const EFER_SCE: u64 = 1;
        pub(crate) const EFER_NX: u64 = 1 << 11;
    }
}

/// The generic exit reasons Hyperlight can handle from a hypervisor. Each
/// hypervisor's `run` method is responsible for mapping its hypervisor-specific
/// exit reasons to these generic ones.
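///
/// # Example (illustrative sketch)
///
/// A backend's `run` method typically translates its raw exit into one of these
/// variants. The `RawExit` type below is hypothetical and only illustrates the
/// shape of that mapping:
///
/// ```rust,ignore
/// let exit = match raw_exit {
///     RawExit::Halt => HyperlightExit::Halt(),
///     RawExit::IoOut { port, data, rip, len } => HyperlightExit::IoOut(port, data, rip, len),
///     RawExit::MmioFault { gpa } => HyperlightExit::Mmio(gpa),
///     RawExit::Interrupted => HyperlightExit::Cancelled(),
///     other => HyperlightExit::Unknown(format!("{:?}", other)),
/// };
/// ```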
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read from or write to an unmapped address
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight
    Unknown(String),
    /// The operation should be retried.
    /// On Linux this can happen when a call to run the vCPU returns EAGAIN;
    /// on Windows the platform may cancel the VM run.
    Retry(),
}

/// A common set of hypervisor functionality
pub(crate) trait Hypervisor: Debug + Send {
    /// Initialise the internally stored vCPU with the given PEB address and
    /// random number seed, then run it until a HLT instruction.
    #[allow(clippy::too_many_arguments)]
    fn initialise(
        &mut self,
        peb_addr: RawPtr,
        seed: u64,
        page_size: u32,
        mem_mgr: SandboxMemoryManager<HostSharedMemory>,
        host_funcs: Arc<Mutex<FunctionRegistry>>,
        guest_max_log_level: Option<LevelFilter>,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
    ) -> Result<()>;

    /// Map a region of host memory into the sandbox.
    ///
    /// Depending on the host platform, there are likely alignment
    /// requirements of at least one page for base and len.
    unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()>;

    /// Unmap a memory region from the sandbox
    unsafe fn unmap_region(&mut self, rgn: &MemoryRegion) -> Result<()>;

    /// Get the currently mapped dynamic memory regions (not including sandbox regions)
    ///
    /// Note: Box needed for trait to be object-safe :(
    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_>;

    /// Dispatch a call from the host to the guest using the given pointer
    /// to the dispatch function _in the guest's address space_.
    ///
    /// Do this by setting the instruction pointer to `dispatch_func_addr`
    /// and then running the execution loop until a halt instruction.
    ///
    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
    fn dispatch_call_from_host(
        &mut self,
        dispatch_func_addr: RawPtr,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
    ) -> Result<()>;

    /// Handle an IO exit from the internally stored vCPU.
    fn handle_io(
        &mut self,
        port: u16,
        data: Vec<u8>,
        rip: u64,
        instruction_length: u64,
    ) -> Result<()>;

    /// Run the vCPU
    fn run(
        &mut self,
        #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext,
    ) -> Result<HyperlightExit>;

    /// Get InterruptHandle to underlying VM (returns internal trait)
    fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal>;

    /// Get regs
    #[allow(dead_code)]
    fn regs(&self) -> Result<CommonRegisters>;
    /// Set regs
    #[allow(dead_code)]
    fn set_regs(&mut self, regs: &CommonRegisters) -> Result<()>;
    /// Get fpu regs
    #[allow(dead_code)]
    fn fpu(&self) -> Result<CommonFpu>;
    /// Set fpu regs
    #[allow(dead_code)]
    fn set_fpu(&mut self, fpu: &CommonFpu) -> Result<()>;
    /// Get special regs
    #[allow(dead_code)]
    fn sregs(&self) -> Result<CommonSpecialRegisters>;
    /// Set special regs
    #[allow(dead_code)]
    fn set_sregs(&mut self, sregs: &CommonSpecialRegisters) -> Result<()>;

    /// Setup initial special registers for the hypervisor
    /// This is a default implementation that works for all hypervisors
    fn setup_initial_sregs(&mut self, _pml4_addr: u64) -> Result<()> {
        #[cfg(feature = "init-paging")]
        let sregs = CommonSpecialRegisters {
            cr0: CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_AM | CR0_PG | CR0_WP,
            cr4: CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT,
            cr3: _pml4_addr,
            efer: EFER_LME | EFER_LMA | EFER_SCE | EFER_NX,
            cs: CommonSegmentRegister {
                type_: 11,
                present: 1,
                s: 1,
                l: 1,
                ..Default::default()
            },
            tr: CommonSegmentRegister {
                limit: 65535,
                type_: 11,
                present: 1,
                s: 0,
                ..Default::default()
            },
            ..Default::default()
        };

        #[cfg(not(feature = "init-paging"))]
        let sregs = CommonSpecialRegisters {
            cs: CommonSegmentRegister {
                base: 0,
                selector: 0,
                limit: 0xFFFF,
                type_: 11,
                present: 1,
                s: 1,
                ..Default::default()
            },
            ds: CommonSegmentRegister {
                base: 0,
                selector: 0,
                limit: 0xFFFF,
                type_: 3,
                present: 1,
                s: 1,
                ..Default::default()
            },
            tr: CommonSegmentRegister {
                base: 0,
                selector: 0,
                limit: 0xFFFF,
                type_: 11,
                present: 1,
                s: 0,
                ..Default::default()
            },
            ..Default::default()
        };

        self.set_sregs(&sregs)?;
        Ok(())
    }

    /// Get the logging level to pass to the guest entrypoint
    fn get_max_log_level(&self) -> u32 {
        // Check whether the RUST_LOG environment variable is set and, if so, parse it
        // to get the log level for hyperlight_guest; if that is not set, fall back to
        // the log level for hyperlight_host.

        // This is done because the guest will produce logs based on the log level returned
        // here; producing those logs is expensive, and we don't want to do it if the host
        // is not going to process them.
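        // For example (illustrative values): RUST_LOG="hyperlight_guest=debug" yields Debug,
        // RUST_LOG="hyperlight_host=info" yields Info, RUST_LOG="warn" yields Warn, and an
        // unset or unparsable value falls back to Error.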

        let val = std::env::var("RUST_LOG").unwrap_or_default();

        let level = if val.contains("hyperlight_guest") {
            val.split(',')
                .find(|s| s.contains("hyperlight_guest"))
                .unwrap_or("")
                .split('=')
                .nth(1)
                .unwrap_or("")
        } else if val.contains("hyperlight_host") {
            val.split(',')
                .find(|s| s.contains("hyperlight_host"))
                .unwrap_or("")
                .split('=')
                .nth(1)
                .unwrap_or("")
        } else {
            // look for a value string that does not contain "="
            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
        };

        log::info!("Determined guest log level: {}", level);
        // Convert the log level string to a LevelFilter
        // If no value is found, default to Error
        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
    }

    /// get a mutable trait object from self
    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;

    #[cfg(crashdump)]
    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>>;

    #[cfg(gdb)]
    /// handles the cases when the vCPU stops due to a Debug event
    fn handle_debug(
        &mut self,
        _dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
        _stop_reason: VcpuStopReason,
    ) -> Result<()> {
        unimplemented!()
    }

    /// Check stack guard to see if the stack is still valid
    fn check_stack_guard(&self) -> Result<bool>;

    #[cfg(feature = "trace_guest")]
    fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()>;

    /// Get a mutable reference of the trace info for the guest
    #[cfg(feature = "mem_profile")]
    fn trace_info_mut(&mut self) -> &mut MemTraceInfo;
}

/// Returns `Some(HyperlightExit::AccessViolation(..))` if the region containing the given
/// `gpa` does not permit the requested access (or is a stack guard page). Returns `None`
/// otherwise, or if no region contains the `gpa`.
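///
/// # Example (sketch)
///
/// ```rust,ignore
/// // Illustrative only: `regions` is an iterator over the sandbox's `MemoryRegion`s,
/// // and the flag name used here is an assumption for the example.
/// if let Some(HyperlightExit::AccessViolation(addr, tried, allowed)) =
///     get_memory_access_violation(gpa as usize, regions, MemoryRegionFlags::WRITE)
/// {
///     // The guest accessed `addr` with `tried` permissions, but the region only allows `allowed`.
/// }
/// ```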
pub(crate) fn get_memory_access_violation<'a>(
    gpa: usize,
    mut mem_regions: impl Iterator<Item = &'a MemoryRegion>,
    access_info: MemoryRegionFlags,
) -> Option<HyperlightExit> {
    // find the region containing the given gpa
    let region = mem_regions.find(|region| region.guest_region.contains(&gpa));

    if let Some(region) = region
        && (!region.flags.contains(access_info)
            || region.flags.contains(MemoryRegionFlags::STACK_GUARD))
    {
        return Some(HyperlightExit::AccessViolation(
            gpa as u64,
            access_info,
            region.flags,
        ));
    }
    None
}

/// A virtual CPU that can be run until an exit occurs
pub struct VirtualCPU {}

impl VirtualCPU {
    /// Run the given hypervisor until a halt instruction is reached
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn run(
        hv: &mut dyn Hypervisor,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
    ) -> Result<()> {
        // Keeps the trace context and open spans
        #[cfg(feature = "trace_guest")]
        let mut tc = crate::sandbox::trace::TraceContext::new();

        loop {
            #[cfg(feature = "trace_guest")]
            let result = {
                let result = hv.run(&mut tc);
                // End current host trace by closing the current span that captures traces
                // happening when a guest exits and re-enters.
                tc.end_host_trace();

                // Handle the guest trace data if any
                if let Err(e) = hv.handle_trace(&mut tc) {
                    // If no trace data is available, we just log a message and continue
                    // Is this the right thing to do?
                    log::debug!("Error handling guest trace: {:?}", e);
                }

                result
            };
            #[cfg(not(feature = "trace_guest"))]
            let result = hv.run();

            match result {
                #[cfg(gdb)]
                Ok(HyperlightExit::Debug(stop_reason)) => {
                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
                        log_then_return!(e);
                    }
                }

                Ok(HyperlightExit::Halt()) => {
                    break;
                }
                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
                    hv.handle_io(port, data, rip, instruction_length)?
                }
                Ok(HyperlightExit::Mmio(addr)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;

                    if !hv.check_stack_guard()? {
                        log_then_return!(StackOverflow());
                    }

                    log_then_return!("MMIO access address {:#x}", addr);
                }
                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;

                    // If GDB is enabled, we handle the debug memory access
                    // Disregard return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
                        return Err(HyperlightError::StackOverflow());
                    }
                    log_then_return!(HyperlightError::MemoryAccessViolation(
                        addr,
                        tried,
                        region_permission
                    ));
                }
                Ok(HyperlightExit::Cancelled()) => {
                    // Cancelled is returned when the host has cancelled execution.
                    // After termination, the main thread will re-initialize the VM.
                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
                    log_then_return!(ExecutionCanceledByHost());
                }
                Ok(HyperlightExit::Unknown(reason)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;
                    // If GDB is enabled, we handle the debug memory access
                    // Disregard return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    log_then_return!("Unexpected VM Exit {:?}", reason);
                }
                Ok(HyperlightExit::Retry()) => {
                    debug!("[VCPU] Retry - continuing VM run loop");
                    continue;
                }
                Err(e) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;
                    // If GDB is enabled, we handle the debug memory access
                    // Disregard return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    return Err(e);
                }
            }
        }

        Ok(())
    }
}

/// A trait for handling interrupts to a sandbox's vcpu (public API)
pub trait InterruptHandle: Debug + Send + Sync {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// This method attempts to cancel a currently executing guest function call by sending
    /// a signal to the VCPU thread. It uses generation tracking and the call_active flag to
    /// ensure the interruption is safe and precise.
    ///
    /// # Behavior
    ///
    /// - **Guest function running**: If called while a guest function is executing (VCPU running
    ///   or in a host function call), this stamps the current generation into cancel_requested
    ///   and sends a signal to interrupt the VCPU. Returns `true`.
    ///
    /// - **No active call**: If called when no guest function call is in progress (call_active=false),
    ///   this has no effect and returns `false`. This prevents "kill-in-advance" where kill()
    ///   is called before a guest function starts.
    ///
    /// - **During host function**: If the guest call is currently executing a host function
    ///   (VCPU not running but call_active=true), this stamps cancel_requested. When the
    ///   host function returns and attempts to re-enter the guest, the cancellation will
    ///   be detected and the call will abort. Returns `true`.
    ///
    /// # Generation Tracking
    ///
    /// The method stamps the current generation number along with the cancellation request.
    /// This ensures that:
    /// - Stale signals from previous calls are ignored (generation mismatch)
    /// - Only the intended guest function call is affected
    /// - Multiple rapid kill() calls on the same generation are idempotent
    ///
    /// # Blocking Behavior
    ///
    /// This function will block while attempting to deliver the signal to the VCPU thread,
    /// retrying until either:
    /// - The signal is successfully delivered (VCPU transitions from running to not running)
    /// - The VCPU stops running for another reason (e.g., the call completes normally)
    ///
    /// # Returns
    ///
    /// - `true`: Cancellation request was stamped (kill will take effect)
    /// - `false`: No active call, cancellation request was not stamped (no effect)
    ///
    /// # Note
    ///
    /// To reliably interrupt a guest call, ensure `kill()` is called while the guest
    /// function is actually executing. Calling kill() before call_guest_function() will
    /// have no effect.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not be interrupted immediately, but it will be prevented from running **the next time**
    ///   it's scheduled, and this returns `false`.
    ///
    /// # Note
    /// This function will block for as long as it takes to interrupt the vcpu thread.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Check if the corresponding VM has been dropped.
    fn dropped(&self) -> bool;
}

/// Internal trait for interrupt handle implementation details (private, cross-platform).
///
/// This trait contains all the internal atomics access methods and helper functions
/// that are shared between Linux and Windows implementations. It extends InterruptHandle
/// to inherit the public API.
///
/// This trait should NOT be used outside of hypervisor implementations.
pub(crate) trait InterruptHandleInternal: InterruptHandle {
    /// Returns the call_active atomic bool reference for internal implementations.
    fn get_call_active(&self) -> &AtomicBool;

    /// Returns the running atomic u64 reference for internal implementations.
    fn get_running(&self) -> &AtomicU64;

    /// Returns the cancel_requested atomic u64 reference for internal implementations.
    fn get_cancel_requested(&self) -> &AtomicU64;

    /// Set call_active - increments generation and sets flag.
    ///
    /// Increments the generation counter and sets the call_active flag to true,
    /// indicating that a guest function call is now in progress. This allows
    /// kill() to stamp cancel_requested with the correct generation.
    ///
    /// Must be called at the start of call_guest_function_by_name_no_reset(),
    /// before any VCPU execution begins.
    ///
    /// Returns true if call_active was already set (indicating a guard already exists),
    /// false otherwise.
    fn set_call_active(&self) -> bool {
        self.increment_generation();
        self.get_call_active().swap(true, Ordering::AcqRel)
    }

    /// Clear call_active - clears the call_active flag.
    ///
    /// Clears the call_active flag, indicating that no guest function call is
    /// in progress. After this, kill() will have no effect and will return false.
    ///
    /// Must be called at the end of call_guest_function_by_name_no_reset(),
    /// after the guest call has fully completed (whether successfully or with error).
    fn clear_call_active(&self) {
        self.get_call_active().store(false, Ordering::Release)
    }

    /// Set cancel_requested to true with the given generation.
    ///
    /// This stamps the cancellation request with the current generation number,
    /// ensuring that only the VCPU running with this exact generation will honor
    /// the cancellation.
    fn set_cancel_requested(&self, generation: u64) {
        const CANCEL_REQUESTED_BIT: u64 = 1 << 63;
        const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1;
        let value = CANCEL_REQUESTED_BIT | (generation & MAX_GENERATION);
        self.get_cancel_requested().store(value, Ordering::Release);
    }

    /// Clear cancel_requested (reset to no cancellation).
    ///
    /// This is called after a cancellation has been processed to reset the
    /// cancellation flag for the next guest call.
    fn clear_cancel_requested(&self) {
        self.get_cancel_requested().store(0, Ordering::Release);
    }

    /// Check if cancel_requested is set for the given generation.
    ///
    /// Returns true only if BOTH:
    /// - The cancellation flag is set
    /// - The stored generation matches the provided generation
    ///
    /// This prevents stale cancellations from affecting new guest calls.
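    ///
    /// For example (illustrative): after `set_cancel_requested(7)` the stored raw value is
    /// `(1 << 63) | 7`, so `is_cancel_requested_for_generation(7)` returns true while
    /// `is_cancel_requested_for_generation(8)` returns false.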
    fn is_cancel_requested_for_generation(&self, generation: u64) -> bool {
        const CANCEL_REQUESTED_BIT: u64 = 1 << 63;
        const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1;
        let raw = self.get_cancel_requested().load(Ordering::Acquire);
        let is_set = raw & CANCEL_REQUESTED_BIT != 0;
        let stored_generation = raw & MAX_GENERATION;
        is_set && stored_generation == generation
    }

    /// Set running bit to true, return current generation.
    ///
    /// This is called when the VCPU is about to enter guest mode. It atomically
    /// sets the running flag while preserving the generation counter.
    fn set_running_bit(&self) -> u64 {
        const RUNNING_BIT: u64 = 1 << 63;
        self.get_running()
            .fetch_update(Ordering::Release, Ordering::Acquire, |raw| {
                Some(raw | RUNNING_BIT)
            })
            .map(|raw| raw & !RUNNING_BIT) // Return the current generation
            .unwrap_or(0)
    }

    /// Increment the generation for a new guest function call.
    ///
    /// The generation counter wraps around at MAX_GENERATION (2^63 - 1).
    /// This is called at the start of each new guest function call to provide
    /// a unique identifier that prevents ABA problems with stale cancellations.
    ///
    /// Returns the NEW generation number (after incrementing).
    fn increment_generation(&self) -> u64 {
        const RUNNING_BIT: u64 = 1 << 63;
        const MAX_GENERATION: u64 = RUNNING_BIT - 1;
        self.get_running()
            .fetch_update(Ordering::Release, Ordering::Acquire, |raw| {
                let current_generation = raw & !RUNNING_BIT;
                let running_bit = raw & RUNNING_BIT;
                if current_generation == MAX_GENERATION {
                    // Restart generation from 0
                    return Some(running_bit);
                }
                Some((current_generation + 1) | running_bit)
            })
            .map(|raw| (raw & !RUNNING_BIT) + 1) // Return the NEW generation
            .unwrap_or(1) // If wrapped, return 1
    }

    /// Get the current running state and generation counter.
    ///
    /// Returns a tuple of (running, generation) where:
    /// - running: true if VCPU is currently in guest mode
    /// - generation: current generation counter value
    fn get_running_and_generation(&self) -> (bool, u64) {
        const RUNNING_BIT: u64 = 1 << 63;
        let raw = self.get_running().load(Ordering::Acquire);
        let running = raw & RUNNING_BIT != 0;
        let generation = raw & !RUNNING_BIT;
        (running, generation)
    }

    /// Clear the running bit and return the old value.
    ///
    /// This is called when the VCPU exits from guest mode back to host mode.
    /// The return value (which includes the generation and the old running bit)
    /// is currently unused by all callers.
    fn clear_running_bit(&self) -> u64 {
        const RUNNING_BIT: u64 = 1 << 63;
        self.get_running()
            .fetch_and(!RUNNING_BIT, Ordering::Release)
    }
}

#[cfg(any(kvm, mshv3))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Atomic flag combining running state and generation counter.
    ///
    /// **Bit 63**: VCPU running state (1 = running, 0 = not running)
    /// **Bits 0-62**: Generation counter (incremented once per guest function call)
    ///
    /// # Generation Tracking
    ///
    /// The generation counter is incremented once at the start of each guest function call
    /// and remains constant throughout that call, even if the VCPU is run multiple times
    /// (due to host function calls, retries, etc.). This design solves the race condition
    /// where a kill() from a previous call could spuriously cancel a new call.
    ///
    /// ## Why Generations Are Needed
    ///
    /// Consider this scenario WITHOUT generation tracking:
    /// 1. Thread A starts guest call 1, VCPU runs
    /// 2. Thread B calls kill(), sends signal to Thread A
    /// 3. Guest call 1 completes before signal arrives
    /// 4. Thread A starts guest call 2, VCPU runs again
    /// 5. Stale signal from step 2 arrives and incorrectly cancels call 2
    ///
    /// WITH generation tracking:
    /// 1. Thread A starts guest call 1 (generation N), VCPU runs
    /// 2. Thread B calls kill(), stamps cancel_requested with generation N
    /// 3. Guest call 1 completes, signal may or may not have arrived yet
    /// 4. Thread A starts guest call 2 (generation N+1), VCPU runs again
    /// 5. If stale signal arrives, signal handler checks: cancel_requested.generation (N) != current generation (N+1)
    /// 6. Stale signal is ignored, call 2 continues normally
    ///
    /// ## Per-Call vs Per-Run Generation
    ///
    /// It's critical that generation is incremented per GUEST FUNCTION CALL, not per vcpu.run():
    /// - A single guest function call may invoke vcpu.run() multiple times (host calls, retries)
    /// - All run() calls within the same guest call must share the same generation
    /// - This ensures kill() affects the entire guest function call atomically
    ///
    /// # Invariants
    ///
    /// - If VCPU is running: bit 63 is set (neither converse nor inverse holds)
    /// - If VCPU is running: bits 0-62 match the current guest call's generation
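    ///
    /// For example (illustrative): a raw value of `(1 << 63) | 42` encodes "running,
    /// generation 42", while a raw value of `42` encodes "not running, generation 42".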
    running: AtomicU64,

    /// Thread ID where the VCPU is currently running.
    ///
    /// # Invariants
    ///
    /// - If VCPU is running: tid contains the thread ID of the executing thread
    /// - Multiple VMs may share the same tid, but at most one will have running=true
    tid: AtomicU64,

    /// Generation-aware cancellation request flag.
    ///
    /// **Bit 63**: Cancellation requested flag (1 = kill requested, 0 = no kill)
    /// **Bits 0-62**: Generation number when cancellation was requested
    ///
    /// # Purpose
    ///
    /// This flag serves three critical functions:
    ///
    /// 1. **Prevent stale signals**: A VCPU may only be interrupted if cancel_requested
    ///    is set AND the generation matches the current call's generation
    ///
    /// 2. **Handle host function calls**: If kill() is called while a host function is
    ///    executing (VCPU not running but call is active), cancel_requested is stamped
    ///    with the current generation. When the host function returns and the VCPU
    ///    attempts to re-enter the guest, it will see the cancellation and abort.
    ///
    /// 3. **Detect stale kills**: If cancel_requested.generation doesn't match the
    ///    current generation, it's from a previous call and should be ignored
    ///
    /// # States and Transitions
    ///
    /// - **No cancellation**: cancel_requested = 0 (bit 63 clear)
    /// - **Cancellation for generation N**: cancel_requested = (1 << 63) | N
    /// - Signal handler checks: (cancel_requested & 0x7FFFFFFFFFFFFFFF) == current_generation
    cancel_requested: AtomicU64,

    /// Flag indicating whether a guest function call is currently in progress.
    ///
    /// **true**: A guest function call is active (between call start and completion)
    /// **false**: No guest function call is active
    ///
    /// # Purpose
    ///
    /// This flag prevents kill() from having any effect when called outside of a
    /// guest function call. This solves the "kill-in-advance" problem where kill()
    /// could be called before a guest function starts and would incorrectly cancel it.
    ///
    /// # Behavior
    ///
    /// - Set to true at the start of call_guest_function_by_name_no_reset()
    /// - Cleared at the end of call_guest_function_by_name_no_reset()
    /// - kill() only stamps cancel_requested if call_active is true
    /// - If kill() is called when call_active=false, it returns false and has no effect
    ///
    /// # Why AtomicBool is Safe
    ///
    /// Although there's a theoretical race where:
    /// 1. Thread A checks call_active (false)
    /// 2. Thread B sets call_active (true) and starts guest call
    /// 3. Thread A's kill() returns false (no effect)
    ///
    /// This is acceptable because the generation tracking provides an additional
    /// safety layer. Even if a stale kill somehow stamped cancel_requested, the
    /// generation mismatch would cause it to be ignored.
    call_active: AtomicBool,

    /// Debugger interrupt request flag (GDB only).
    ///
    /// Set when kill_from_debugger() is called, cleared when VCPU stops running.
    /// Used to distinguish debugger interrupts from normal kill() interrupts.
    #[cfg(gdb)]
    debug_interrupt: AtomicBool,

    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,

    /// Delay between retry attempts when sending signals to the VCPU thread.
    retry_delay: Duration,

    /// Offset from SIGRTMIN for the signal used to interrupt the VCPU thread.
    sig_rt_min_offset: u8,
}

#[cfg(any(kvm, mshv3))]
impl LinuxInterruptHandle {
    fn send_signal(&self, stamp_generation: bool) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent_signal = false;
        let mut target_generation: Option<u64> = None;

        loop {
            if !self.call_active.load(Ordering::Acquire) {
                // No active call, so no need to send signal
                break;
            }

            let (running, generation) = self.get_running_and_generation();

            // Stamp generation into cancel_requested if requested and this is the first iteration
            // We stamp even when running=false to support killing during host function calls
            // The generation tracking will prevent stale kills from affecting new calls
            // Only stamp if a call is actually active (call_active=true)
            if stamp_generation
                && target_generation.is_none()
                && self.call_active.load(Ordering::Acquire)
            {
                self.set_cancel_requested(generation);
                target_generation = Some(generation);
            }

            // If not running, we've stamped the generation (if requested), so we're done
            // This handles the host function call scenario
            if !running {
                break;
            }

            match target_generation {
                None => target_generation = Some(generation),
                // prevent ABA problem
                Some(expected) if expected != generation => break,
                _ => {}
            }

            log::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        if !(self.call_active.load(Ordering::Acquire)) {
            // No active call, so no effect
            return false;
        }

        // send_signal will stamp the generation into cancel_requested
        // right before sending each signal, ensuring they're always in sync
        self.send_signal(true)
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.debug_interrupt.store(true, Ordering::Relaxed);
        self.send_signal(false)
    }

    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandleInternal for LinuxInterruptHandle {
    fn get_call_active(&self) -> &AtomicBool {
        &self.call_active
    }

    fn get_running(&self) -> &AtomicU64 {
        &self.running
    }

    fn get_cancel_requested(&self) -> &AtomicU64 {
        &self.cancel_requested
    }
}

#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    #[test]
    fn test_initialise() -> Result<()> {
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mem_mgr, mut gshm) = sandbox.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut gshm,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(log::LevelFilter::Error);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            mem_mgr,
            host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )?;

        Ok(())
    }
953}