// hyperlight_host/hypervisor/mod.rs
/*
Copyright 2025 The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
16
17use log::LevelFilter;
18
19use crate::Result;
20use crate::hypervisor::regs::{CommonFpu, CommonRegisters, CommonSpecialRegisters};
21use crate::mem::memory_region::MemoryRegion;
22
23/// HyperV-on-linux functionality
24#[cfg(mshv3)]
25pub(crate) mod hyperv_linux;
26#[cfg(target_os = "windows")]
27pub(crate) mod hyperv_windows;
28
29/// GDB debugging support
30#[cfg(gdb)]
31pub(crate) mod gdb;
32
33/// Abstracts over different hypervisor register representations
34pub(crate) mod regs;
35
36#[cfg(kvm)]
37/// Functionality to manipulate KVM-based virtual machines
38pub(crate) mod kvm;
39
40#[cfg(target_os = "windows")]
41/// Hyperlight Surrogate Process
42pub(crate) mod surrogate_process;
43#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
45pub(crate) mod surrogate_process_manager;
46/// Safe wrappers around windows types like `PSTR`
47#[cfg(target_os = "windows")]
48pub(crate) mod wrappers;
49
50#[cfg(crashdump)]
51pub(crate) mod crashdump;
52
53pub(crate) mod hyperlight_vm;
54
55use std::fmt::Debug;
56use std::str::FromStr;
57#[cfg(any(kvm, mshv3))]
58use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering};
59#[cfg(target_os = "windows")]
60use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
61#[cfg(any(kvm, mshv3))]
62use std::time::Duration;
63
/// The reasons a vCPU run can stop, as returned by [`Hypervisor::run_vcpu`].
///
/// Each backend (KVM, MSHV, WHP) translates its native exit reason into one
/// of these variants so the rest of the crate can handle exits uniformly.
pub(crate) enum HyperlightExit {
    /// The vCPU has exited due to a debug event (usually breakpoint)
    #[cfg(gdb)]
    Debug { dr6: u64, exception: u32 },
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value
    IoOut(u16, Vec<u8>),
    /// The vCPU tried to read from the given (unmapped) addr
    MmioRead(u64),
    /// The vCPU tried to write to the given (unmapped) addr
    MmioWrite(u64),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight
    Unknown(String),
    /// The operation should be retried, for example this can happen on Linux where a call to run the CPU can return EAGAIN
    #[cfg_attr(
        target_os = "windows",
        expect(
            dead_code,
            reason = "Retry() is never constructed on Windows, but it is still matched on (which dead_code lint ignores)"
        )
    )]
    Retry(),
}
90
/// Trait for single-vCPU VMs. Provides a common interface for basic VM operations.
/// Abstracts over differences between KVM, MSHV and WHP implementations.
pub(crate) trait Hypervisor: Debug + Send {
    /// Map memory region into this VM
    ///
    /// The `u32` half of `region` is a caller-chosen identifier for the
    /// mapping; pass the same identifier to [`Self::unmap_memory`] to undo it.
    ///
    /// # Safety
    /// The caller must ensure that the memory region is valid and points to valid memory,
    /// and lives long enough for the VM to use it.
    /// The caller must ensure that the given u32 is not already mapped, otherwise previously mapped
    /// memory regions may be overwritten.
    /// The memory region must not overlap with an existing region, and depending on platform, must be aligned to page boundaries.
    unsafe fn map_memory(&mut self, region: (u32, &MemoryRegion)) -> Result<()>;

    /// Unmap memory region from this VM that has previously been mapped using `map_memory`.
    /// The `u32` must be the identifier that was passed to `map_memory`.
    fn unmap_memory(&mut self, region: (u32, &MemoryRegion)) -> Result<()>;

    /// Runs the vCPU until it exits.
    /// Note: this function should not emit any traces or spans as it is called after guest span is setup
    fn run_vcpu(&mut self) -> Result<HyperlightExit>;

    /// Get regs
    #[allow(dead_code)]
    fn regs(&self) -> Result<CommonRegisters>;
    /// Set regs
    fn set_regs(&self, regs: &CommonRegisters) -> Result<()>;
    /// Get fpu regs
    #[allow(dead_code)]
    fn fpu(&self) -> Result<CommonFpu>;
    /// Set fpu regs
    fn set_fpu(&self, fpu: &CommonFpu) -> Result<()>;
    /// Get special regs
    #[allow(dead_code)]
    fn sregs(&self) -> Result<CommonSpecialRegisters>;
    /// Set special regs
    fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> Result<()>;

    /// Get the raw xsave area of the vCPU (only needed when producing crash dumps)
    #[cfg(crashdump)]
    fn xsave(&self) -> Result<Vec<u8>>;

    /// Get partition handle
    #[cfg(target_os = "windows")]
    fn partition_handle(&self) -> windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE;

    /// Mark that initial memory setup is complete. After this, map_memory will fail.
    /// This is only needed on Windows where dynamic memory mapping is not yet supported.
    #[cfg(target_os = "windows")]
    fn complete_initial_memory_setup(&mut self);
}
140
141/// Get the logging level to pass to the guest entrypoint
142fn get_max_log_level() -> u32 {
143 // Check to see if the RUST_LOG environment variable is set
144 // and if so, parse it to get the log_level for hyperlight_guest
145 // if that is not set get the log level for the hyperlight_host
146
147 // This is done as the guest will produce logs based on the log level returned here
148 // producing those logs is expensive and we don't want to do it if the host is not
149 // going to process them
150
151 let val = std::env::var("RUST_LOG").unwrap_or_default();
152
153 let level = if val.contains("hyperlight_guest") {
154 val.split(',')
155 .find(|s| s.contains("hyperlight_guest"))
156 .unwrap_or("")
157 .split('=')
158 .nth(1)
159 .unwrap_or("")
160 } else if val.contains("hyperlight_host") {
161 val.split(',')
162 .find(|s| s.contains("hyperlight_host"))
163 .unwrap_or("")
164 .split('=')
165 .nth(1)
166 .unwrap_or("")
167 } else {
168 // look for a value string that does not contain "="
169 val.split(',').find(|s| !s.contains("=")).unwrap_or("")
170 };
171
172 log::info!("Determined guest log level: {}", level);
173 // Convert the log level string to a LevelFilter
174 // If no value is found, default to Error
175 LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
176}
177
/// A trait for platform-specific interrupt handle implementation details
///
/// These methods are internal hooks used by the vcpu run loop to publish its
/// state; external users interact only with the public [`InterruptHandle`]
/// supertrait.
pub(crate) trait InterruptHandleImpl: InterruptHandle {
    /// Set the thread ID for the vcpu thread
    #[cfg(any(kvm, mshv3))]
    fn set_tid(&self);

    /// Set the running state
    fn set_running(&self);

    /// Clear the running state
    fn clear_running(&self);

    /// Mark the handle as dropped
    fn set_dropped(&self);

    /// Check if cancellation was requested
    fn is_cancelled(&self) -> bool;

    /// Clear the cancellation request flag
    fn clear_cancel(&self);

    /// Check if debug interrupt was requested (always returns false when gdb feature is disabled)
    fn is_debug_interrupted(&self) -> bool;

    /// Clear the debug interrupt request flag
    #[cfg(gdb)]
    fn clear_debug_interrupt(&self);
}
206
/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Send + Sync + Debug {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the sandbox is currently executing a guest function call, it will interrupt the sandbox and return `true`.
    /// - If this is called while the sandbox is not running (for example before or after calling a guest function), it will do nothing and return `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running, (for example during a host call), the
    /// vcpu will not immediately be interrupted, but will prevent the vcpu from running **the next time**
    /// it's scheduled, and returns `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
233
#[cfg(any(kvm, mshv3))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// Thread ID where the vcpu is running; target of the interrupt signal.
    ///
    /// Note: Multiple VMs may have the same `tid` (same thread runs multiple sandboxes sequentially),
    /// but at most one VM will have RUNNING_BIT set at any given time.
    tid: AtomicU64,

    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,

    /// Delay between retry attempts when sending signals to interrupt the vcpu.
    retry_delay: Duration,

    /// Offset from SIGRTMIN for the signal used to interrupt the vcpu thread
    /// (i.e. the signal sent is `SIGRTMIN() + sig_rt_min_offset`).
    sig_rt_min_offset: u8,
}
263
#[cfg(any(kvm, mshv3))]
impl LinuxInterruptHandle {
    /// Set while the vcpu is actively running (bit 1 of `state`).
    const RUNNING_BIT: u8 = 1 << 1;
    /// Set when cancellation has been requested (bit 0 of `state`).
    const CANCEL_BIT: u8 = 1 << 0;
    /// Set when a debugger interrupt has been requested (bit 2 of `state`).
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;

    /// Atomically snapshot the running, cancel and debug flags.
    ///
    /// # Memory Ordering
    /// Loads with `Acquire` to pair with the `Release` in `set_running()` and
    /// `kill()`, so that observing running=true also guarantees the matching
    /// `tid` store is visible.
    fn get_running_cancel_debug(&self) -> (bool, bool, bool) {
        let snapshot = self.state.load(Ordering::Acquire);
        let is_set = |bit: u8| (snapshot & bit) != 0;
        #[cfg(gdb)]
        let debug = is_set(Self::DEBUG_INTERRUPT_BIT);
        #[cfg(not(gdb))]
        let debug = false;
        (is_set(Self::RUNNING_BIT), is_set(Self::CANCEL_BIT), debug)
    }

    /// Keep signalling the vcpu thread while it is still running with a
    /// pending cancel or debug request. Returns whether at least one signal
    /// was sent.
    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + libc::c_int::from(self.sig_rt_min_offset);
        let mut delivered = false;

        loop {
            let (running, cancel, debug) = self.get_running_cancel_debug();

            // Stop once the vcpu is no longer running, or once neither a
            // cancel nor a debug interrupt remains pending.
            if !(running && (cancel || debug)) {
                return delivered;
            }

            log::info!("Sending signal to kill vcpu thread...");
            delivered = true;
            // Acquire pairs with the Release store in set_tid(): we must see
            // the tid belonging to the currently running vcpu.
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }
    }
}
315
#[cfg(any(kvm, mshv3))]
impl InterruptHandleImpl for LinuxInterruptHandle {
    fn set_tid(&self) {
        let current = unsafe { libc::pthread_self() as u64 };
        // Release pairs with the Acquire load in send_signal(): once
        // RUNNING_BIT is observed (via Acquire), this tid store is visible.
        self.tid.store(current, Ordering::Release);
    }

    fn set_running(&self) {
        // Release so the preceding tid store is visible to any thread that
        // observes running=true with Acquire; prevents reading a stale tid.
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release so all vcpu operations are visible before running is cleared.
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire pairs with the Release in kill() that sets CANCEL_BIT.
        let snapshot = self.state.load(Ordering::Acquire);
        (snapshot & Self::CANCEL_BIT) != 0
    }

    fn clear_cancel(&self) {
        // Release so operations from the previous run() are visible to other
        // threads; the VM itself can migrate between threads across guest calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            (self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT) != 0
        }
        // Without gdb support there is no debug interrupt to observe.
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Release pairs with the Acquire load in dropped(), making all VM
        // cleanup operations visible to whoever checks the flag.
        self.dropped.store(true, Ordering::Release);
    }
}
374
#[cfg(any(kvm, mshv3))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        // Record the cancellation request first; Release pairs with the
        // Acquire load in is_cancelled() on the vcpu thread.
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);
        // Then signal the vcpu thread for as long as it is still running.
        self.send_signal()
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        self.send_signal()
    }

    fn dropped(&self) -> bool {
        // Acquire pairs with the Release in set_dropped(), so all cleanup
        // performed before the drop is visible here.
        self.dropped.load(Ordering::Acquire)
    }
}
398
#[cfg(target_os = "windows")]
#[derive(Debug)]
pub(super) struct WindowsInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running,
    /// which is why we need the RUNNING_BIT.
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// WHP partition handle, passed to `WHvCancelRunVirtualProcessor()` to interrupt the vcpu.
    partition_handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,
}
419
#[cfg(target_os = "windows")]
impl WindowsInterruptHandle {
    /// Set while the vcpu is actively running (bit 1 of `state`).
    const RUNNING_BIT: u8 = 1 << 1;
    /// Set when cancellation has been requested (bit 0 of `state`).
    const CANCEL_BIT: u8 = 1 << 0;
    /// Set when a debugger interrupt has been requested (bit 2 of `state`).
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;
}
427
#[cfg(target_os = "windows")]
impl InterruptHandleImpl for WindowsInterruptHandle {
    fn set_running(&self) {
        // Release so prior memory operations are visible to any thread that
        // observes running=true with Acquire.
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release so all vcpu operations are visible before running is cleared.
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire pairs with the Release in kill() that sets CANCEL_BIT.
        let snapshot = self.state.load(Ordering::Acquire);
        (snapshot & Self::CANCEL_BIT) != 0
    }

    fn clear_cancel(&self) {
        // Release so operations from the previous run() are visible to other
        // threads; the VM itself can migrate between threads across guest calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            (self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT) != 0
        }
        // Without gdb support there is no debug interrupt to observe.
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Release pairs with the Acquire load in dropped(), making all VM
        // cleanup operations visible to whoever checks the flag.
        self.dropped.store(true, Ordering::Release);
    }
}
476
#[cfg(target_os = "windows")]
impl InterruptHandle for WindowsInterruptHandle {
    fn kill(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        // Publish the cancellation request first; Release pairs with the
        // Acquire load in is_cancelled() on the vcpu thread.
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);

        // Acquire pairs with the Release in set_running(): only attempt the
        // cancellation when RUNNING_BIT is currently set.
        let snapshot = self.state.load(Ordering::Acquire);
        if snapshot & Self::RUNNING_BIT == 0 {
            return false;
        }
        unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        // Acquire pairs with the Release in set_running().
        let snapshot = self.state.load(Ordering::Acquire);
        if snapshot & Self::RUNNING_BIT == 0 {
            return false;
        }
        unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
    }

    fn dropped(&self) -> bool {
        // Acquire pairs with the Release in set_dropped(), so all cleanup
        // performed before the drop is visible here.
        self.dropped.load(Ordering::Acquire)
    }
}
516
/// Backend smoke tests, compiled only where a hypervisor backend exists.
#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    /// Smoke test: build a hypervisor partition for the dummy guest binary
    /// and drive `initialise` end-to-end with placeholder parameters.
    #[test]
    fn test_initialise() -> Result<()> {
        // Skip (and trivially pass) on machines without virtualization support.
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mut mem_mgr, mut gshm) = sandbox.mgr.build();
        // NOTE: argument order here is significant — the cfg-gated `rt_cfg`
        // argument only exists when crashdump or gdb is enabled.
        let mut vm = set_up_hypervisor_partition(
            &mut gshm,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(log::LevelFilter::Error);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            &mut mem_mgr,
            &host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )?;

        Ok(())
    }
}
579}