hyperlight_common/vmem.rs
/*
Copyright 2025 The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
 */

#[cfg_attr(target_arch = "x86", path = "arch/i686/vmem.rs")]
#[cfg_attr(
    all(target_arch = "x86_64", not(feature = "i686-guest")),
    path = "arch/amd64/vmem.rs"
)]
#[cfg_attr(
    all(target_arch = "x86_64", feature = "i686-guest"),
    path = "arch/i686/vmem.rs"
)]
#[cfg_attr(target_arch = "aarch64", path = "arch/aarch64/vmem.rs")]
mod arch;

#[cfg(all(
    feature = "i686-guest",
    not(any(target_arch = "x86", target_arch = "x86_64"))
))]
compile_error!(
    "the `i686-guest` feature is only supported on `target_arch = \"x86\"` (guest) or \
     `target_arch = \"x86_64\"` (host) targets"
);

/// This is always the page size that the /guest/ is being compiled
/// for, which may or may not be the same as the host page size.
pub use arch::PAGE_SIZE;
pub use arch::{PAGE_PRESENT, PAGE_TABLE_SIZE, PTE_ADDR_MASK, PageTableEntry, PhysAddr, VirtAddr};
pub const PAGE_TABLE_ENTRIES_PER_TABLE: usize =
    PAGE_TABLE_SIZE / core::mem::size_of::<PageTableEntry>();
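
// A test-only sanity check of the derived constant (hedged assumption: the
// conventional 4 KiB tables described in the iterator docs below, i.e. 512
// 8-byte entries on amd64 and 1024 4-byte entries on i686; other arches
// have no expectation asserted here).
#[cfg(test)]
mod entries_per_table_example {
    use super::PAGE_TABLE_ENTRIES_PER_TABLE;

    #[test]
    fn matches_documented_arch_layout() {
        let n = PAGE_TABLE_ENTRIES_PER_TABLE;
        #[cfg(all(target_arch = "x86_64", not(feature = "i686-guest")))]
        assert_eq!(n, 512);
        #[cfg(any(target_arch = "x86", feature = "i686-guest"))]
        assert_eq!(n, 1024);
        // Referenced unconditionally so the binding is used on every arch.
        let _ = n;
    }
}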

// Shared page table iterator infrastructure used by each arch module.

/// Utility function to extract an (inclusive on both ends) bit range
/// from a quadword.
#[inline(always)]
pub(in crate::vmem) fn bits<const HIGH_BIT: u8, const LOW_BIT: u8>(x: u64) -> u64 {
    (x & ((1 << (HIGH_BIT + 1)) - 1)) >> LOW_BIT
}
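
// A small test-only illustration of `bits`: the range is inclusive on both
// ends, so `bits::<3, 1>` keeps bits 3..=1. The second assertion shows the
// amd64-style PML4 index extraction documented on `ModifyPteIterator` below.
#[cfg(test)]
mod bits_example {
    use super::bits;

    #[test]
    fn extracts_inclusive_bit_ranges() {
        // 0b1_1110 has bits 4..=1 set; keeping bits 3..=1 yields 0b111.
        assert_eq!(bits::<3, 1>(0b1_1110), 0b111);
        // VA 0xffff_8000_0000_0000 has only bit 47 set within the index
        // range 47..=39, so its PML4 index is 1 << 8 = 256.
        assert_eq!(bits::<47, 39>(0xffff_8000_0000_0000), 256);
    }
}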

/// Helper function to write a page table entry, updating the whole
/// chain of tables back to the root if necessary.
///
/// # Safety
/// Same requirements as [`TableOps::write_entry`].
pub(in crate::vmem) unsafe fn write_entry_updating<
    Op: TableOps,
    P: UpdateParent<
        Op,
        TableMoveInfo = <Op::TableMovability as TableMovabilityBase<Op>>::TableMoveInfo,
    >,
>(
    op: &Op,
    parent: P,
    addr: Op::TableAddr,
    entry: u64,
) {
    #[allow(clippy::useless_conversion)]
    if let Some(again) = unsafe { op.write_entry(addr, entry as PageTableEntry) } {
        parent.update_parent(op, again);
    }
}

/// A helper trait that allows us to move a page table (e.g. from the
/// snapshot to the scratch region), keeping track of the context that
/// needs to be updated when that is moved (and potentially
/// recursively updating, if necessary).
///
/// This is done via a trait so that the selected impl knows the exact
/// nesting depth of tables, in order to assist
/// inlining/specialisation in generating efficient code.
///
/// The trait definition only bounds its parameter by
/// [`TableReadOps`], since [`UpdateParentNone`] does not need to be
/// able to actually write to the tables.
pub trait UpdateParent<Op: TableReadOps + ?Sized>: Copy {
    /// The type of the information about a moved table which is
    /// needed in order to update its parent.
    type TableMoveInfo;
    /// The [`UpdateParent`] type that should be used when going down
    /// another level in the table, in order to add the current level
    /// to the chain of ancestors to be updated.
    type ChildType: UpdateParent<Op, TableMoveInfo = Self::TableMoveInfo>;
    /// Update the parent entry (and, transitively, any ancestors) to
    /// point at the table that has been moved to `new_ptr`.
    fn update_parent(self, op: &Op, new_ptr: Self::TableMoveInfo);
    /// Return the [`UpdateParent`] to use for the child table pointed
    /// to by the entry at `entry_ptr`, one level further down.
    fn for_child_at_entry(self, entry_ptr: Op::TableAddr) -> Self::ChildType;
}

/// A struct implementing [`UpdateParent`] that is impossible to use
/// (since its [`UpdateParent::update_parent`] method takes [`Void`]),
/// used when it is statically known that a table operation cannot
/// result in a need to update ancestors.
#[derive(Copy, Clone)]
pub struct UpdateParentNone {}
impl<Op: TableReadOps> UpdateParent<Op> for UpdateParentNone {
    type TableMoveInfo = Void;
    type ChildType = Self;
    fn update_parent(self, _op: &Op, impossible: Void) {
        match impossible {}
    }
    fn for_child_at_entry(self, _entry_ptr: Op::TableAddr) -> Self {
        self
    }
}
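
// A hedged, test-only sketch of a *chaining* `UpdateParent`, the counterpart
// to `UpdateParentNone` above. Everything here is hypothetical (the real
// implementations live in the arch modules): `SketchOps` models tables as
// indices into a flat array of entries, and `PatchParent` remembers which
// parent entry points at a child so that, when the child "moves", it can
// rewrite that entry while preserving the low flag bits.
#[cfg(test)]
mod update_parent_sketch {
    use core::cell::Cell;

    use super::*;

    struct SketchOps {
        // Eight fake page-table entries; `TableAddr` is an index here.
        entries: [Cell<u64>; 8],
    }
    impl TableReadOps for SketchOps {
        type TableAddr = usize;
        fn entry_addr(addr: usize, entry_offset: u64) -> usize {
            // Assumes 8-byte entries purely for this sketch.
            addr + (entry_offset as usize) / 8
        }
        unsafe fn read_entry(&self, addr: usize) -> PageTableEntry {
            self.entries[addr].get() as PageTableEntry
        }
        fn to_phys(addr: usize) -> PhysAddr {
            (addr as u64) * 0x1000
        }
        fn from_phys(addr: PhysAddr) -> usize {
            (addr / 0x1000) as usize
        }
        fn root_table(&self) -> usize {
            0
        }
    }

    #[derive(Copy, Clone)]
    struct PatchParent {
        entry: usize,
    }
    impl UpdateParent<SketchOps> for PatchParent {
        type TableMoveInfo = usize;
        type ChildType = PatchParent;
        fn update_parent(self, op: &SketchOps, new_ptr: usize) {
            // Rewrite the parent entry to point at the moved table,
            // keeping whatever flag bits were already present.
            let flags = op.entries[self.entry].get() & 0xFFF;
            op.entries[self.entry].set(SketchOps::to_phys(new_ptr) | flags);
        }
        fn for_child_at_entry(self, entry_ptr: usize) -> PatchParent {
            PatchParent { entry: entry_ptr }
        }
    }

    #[test]
    fn patches_parent_entry_when_child_moves() {
        let ops = SketchOps {
            entries: Default::default(),
        };
        // Parent entry 0 points at a "table" at index 4, flag bits = 1.
        ops.entries[0].set(SketchOps::to_phys(4) | 1);
        // The child table moves to index 6; the updater fixes entry 0.
        PatchParent { entry: 0 }.update_parent(&ops, 6);
        assert_eq!(ops.entries[0].get(), SketchOps::to_phys(6) | 1);
    }
}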

/// A helper structure indicating a mapping operation that needs to be
/// performed.
pub(in crate::vmem) struct MapRequest<Op: TableReadOps, P: UpdateParent<Op>> {
    pub table_base: Op::TableAddr,
    pub vmin: u64,
    pub len: u64,
    pub update_parent: P,
}

/// A helper structure indicating that a particular PTE needs to be
/// modified.
pub(in crate::vmem) struct MapResponse<Op: TableReadOps, P: UpdateParent<Op>> {
    pub entry_ptr: Op::TableAddr,
    pub vmin: u64,
    pub len: u64,
    pub update_parent: P,
}

/// Iterator that walks through page table entries at a specific level.
///
/// Given a virtual address range and a table base, this iterator yields
/// `MapResponse` items for each page table entry that needs to be modified.
/// The const generics `HIGH_BIT` and `LOW_BIT` specify which bits of the
/// virtual address are used to index into this level's table.
///
/// For example on amd64:
/// - PML4: HIGH_BIT=47, LOW_BIT=39 (9 bits = 512 entries, each covering 512GB)
/// - PDPT: HIGH_BIT=38, LOW_BIT=30 (9 bits = 512 entries, each covering 1GB)
/// - PD: HIGH_BIT=29, LOW_BIT=21 (9 bits = 512 entries, each covering 2MB)
/// - PT: HIGH_BIT=20, LOW_BIT=12 (9 bits = 512 entries, each covering 4KB)
///
/// On i686:
/// - PD: HIGH_BIT=31, LOW_BIT=22 (10 bits = 1024 entries, each covering 4MB)
/// - PT: HIGH_BIT=21, LOW_BIT=12 (10 bits = 1024 entries, each covering 4KB)
pub(in crate::vmem) struct ModifyPteIterator<
    const HIGH_BIT: u8,
    const LOW_BIT: u8,
    Op: TableReadOps,
    P: UpdateParent<Op>,
> {
    request: MapRequest<Op, P>,
    n: u64,
}
impl<const HIGH_BIT: u8, const LOW_BIT: u8, Op: TableReadOps, P: UpdateParent<Op>> Iterator
    for ModifyPteIterator<HIGH_BIT, LOW_BIT, Op, P>
{
    type Item = MapResponse<Op, P>;
    fn next(&mut self) -> Option<Self::Item> {
        // Each page table entry at this level covers a region of size
        // (1 << LOW_BIT) bytes. For example, at the PT level
        // (LOW_BIT=12), each entry covers 4KB (0x1000 bytes). At the
        // PD level (LOW_BIT=21), each entry covers 2MB (0x200000
        // bytes).
        //
        // This mask isolates the bits below this level's index bits,
        // used for alignment.
        let lower_bits_mask = (1u64 << LOW_BIT) - 1;

        // Calculate the virtual address for this iteration.
        // On the first iteration (n=0), start at the requested vmin.
        // On subsequent iterations, advance to the next aligned boundary.
        // This handles the case where vmin isn't aligned to this level's
        // entry size.
        let next_vmin = if self.n == 0 {
            self.request.vmin
        } else {
            // Align to the next boundary by adding one entry's worth
            // and masking off lower bits. Masking off before adding
            // is safe, since n << LOW_BIT must always have zeros in
            // these positions.
            let aligned_min = self.request.vmin & !lower_bits_mask;
            // Use checked_add because going past the end of the
            // address space counts as "the next one would be out of
            // range"
            aligned_min.checked_add(self.n << LOW_BIT)?
        };

        // Check if we've processed the entire requested range
        if next_vmin >= self.request.vmin + self.request.len {
            return None;
        }

        // Calculate the pointer to this level's page table entry.
        // bits::<HIGH_BIT, LOW_BIT> extracts the relevant index bits
        // from the virtual address. Multiply by the PTE size to get
        // the byte offset.
        let pte_index = bits::<HIGH_BIT, LOW_BIT>(next_vmin);
        let entry_ptr = Op::entry_addr(
            self.request.table_base,
            pte_index * core::mem::size_of::<PageTableEntry>() as u64,
        );

        // Calculate how many bytes remain to be mapped from this point.
        let len_from_here = self.request.len - (next_vmin - self.request.vmin);
        // Calculate the maximum bytes this single entry can cover.
        // If next_vmin is aligned, this is the full entry size (1 << LOW_BIT).
        // If not aligned (only possible on first iteration), it's the
        // remaining space until the next boundary.
        let max_len = (1u64 << LOW_BIT) - (next_vmin & lower_bits_mask);
        // The actual length for this entry is the smaller of what's
        // needed vs what fits.
        let next_len = core::cmp::min(len_from_here, max_len);

        // Advance iteration counter for next call
        self.n += 1;

        Some(MapResponse {
            entry_ptr,
            vmin: next_vmin,
            len: next_len,
            update_parent: self.request.update_parent,
        })
    }
}

pub(in crate::vmem) fn modify_ptes<
    const HIGH_BIT: u8,
    const LOW_BIT: u8,
    Op: TableReadOps,
    P: UpdateParent<Op>,
>(
    r: MapRequest<Op, P>,
) -> ModifyPteIterator<HIGH_BIT, LOW_BIT, Op, P> {
    ModifyPteIterator { request: r, n: 0 }
}
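
// A hedged, test-only demonstration of `modify_ptes` at the PT level
// (HIGH_BIT=20, LOW_BIT=12, as documented above). `NullOps` is a
// hypothetical `TableReadOps` whose addresses are plain `u64`s; it exists
// only so the iterator's chunking behaviour can be observed: an unaligned
// 8 KiB request starting at 0x1800 splits into a partial head, one full
// 4 KiB entry, and a partial tail.
#[cfg(test)]
mod modify_ptes_example {
    use super::*;

    struct NullOps;
    impl TableReadOps for NullOps {
        type TableAddr = u64;
        fn entry_addr(addr: u64, entry_offset: u64) -> u64 {
            addr + entry_offset
        }
        unsafe fn read_entry(&self, _addr: u64) -> PageTableEntry {
            // Pretend every entry is empty; the iterator never reads.
            0 as PageTableEntry
        }
        fn to_phys(addr: u64) -> PhysAddr {
            addr
        }
        fn from_phys(addr: PhysAddr) -> u64 {
            addr
        }
        fn root_table(&self) -> u64 {
            0
        }
    }

    #[test]
    fn splits_unaligned_range_at_pt_level() {
        // Constructed only so the type is used by value in this sketch.
        let _ops = NullOps;
        let req = MapRequest::<NullOps, UpdateParentNone> {
            table_base: 0x1000,
            vmin: 0x1800,
            len: 0x2000,
            update_parent: UpdateParentNone {},
        };
        let mut it = modify_ptes::<20, 12, _, _>(req);
        for (vmin, len) in [(0x1800, 0x800), (0x2000, 0x1000), (0x3000, 0x800)] {
            let r = it.next().unwrap();
            assert_eq!((r.vmin, r.len), (vmin, len));
        }
        assert!(it.next().is_none());
    }
}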

/// The read-only operations used to actually access the page table
/// structures, used to allow the same code to be used in the host and
/// the guest for page table setup. This is distinct from
/// [`TableOps`], since there are some implementations for which
/// writing does not make sense, and only reading is required.
pub trait TableReadOps {
    /// The type of table addresses
    type TableAddr: Copy;

    /// Offset the table address by the given offset in bytes.
    ///
    /// # Parameters
    /// - `addr`: The base address of the table.
    /// - `entry_offset`: The offset in **bytes** within the page table. This is
    ///   not an entry index; callers must multiply the entry index by the size
    ///   of a page table entry (typically 8 bytes) to obtain the correct byte offset.
    ///
    /// # Returns
    /// The address of the entry at the given byte offset from the base address.
    fn entry_addr(addr: Self::TableAddr, entry_offset: u64) -> Self::TableAddr;

    /// Read a u64 from the given address, used to read existing page
    /// table entries
    ///
    /// # Safety
    /// This reads from the given memory address, and so all the usual
    /// Rust things about raw pointers apply. This will also be used
    /// to update guest page tables, so especially in the guest, it is
    /// important to ensure that the page table updates do not break
    /// invariants. The implementor of the trait should ensure that
    /// nothing else will be reading/writing the address at the same
    /// time as mapping code using the trait.
    unsafe fn read_entry(&self, addr: Self::TableAddr) -> PageTableEntry;

    /// Convert an abstract table address to a concrete physical address (u64)
    /// which can be e.g. written into a page table entry
    fn to_phys(addr: Self::TableAddr) -> PhysAddr;

    /// Convert a concrete physical address (u64) which may have been e.g. read
    /// from a page table entry back into an abstract table address
    fn from_phys(addr: PhysAddr) -> Self::TableAddr;

    /// Return the address of the root page table
    fn root_table(&self) -> Self::TableAddr;
}

/// Our own version of `!` until it is stable. Used to avoid needing to
/// implement [`TableOps::update_root`] for ops that never need
/// to move a table.
pub enum Void {}

/// A marker struct, used by an implementation of [`TableOps`] to
/// indicate that it may need to move existing page tables
pub struct MayMoveTable {}
/// A marker struct, used by an implementation of [`TableOps`] to
/// indicate that it will be able to update existing page tables
/// in-place, without moving them.
pub struct MayNotMoveTable {}

mod sealed {
    use super::{MayMoveTable, MayNotMoveTable, TableReadOps, Void};

    /// A (purposefully-not-exposed) internal implementation detail of the
    /// logic around whether a [`TableOps`] implementation may or may not
    /// move page tables.
    pub trait TableMovabilityBase<Op: TableReadOps + ?Sized> {
        type TableMoveInfo;
    }
    impl<Op: TableReadOps> TableMovabilityBase<Op> for MayMoveTable {
        type TableMoveInfo = Op::TableAddr;
    }
    impl<Op: TableReadOps> TableMovabilityBase<Op> for MayNotMoveTable {
        type TableMoveInfo = Void;
    }
}
use sealed::*;

/// A sealed trait used to collect some information about the marker structures [`MayMoveTable`] and [`MayNotMoveTable`]
pub trait TableMovability<Op: TableReadOps + ?Sized>:
    TableMovabilityBase<Op>
    + arch::TableMovability<Op, <Self as TableMovabilityBase<Op>>::TableMoveInfo>
{
}
impl<
    Op: TableReadOps,
    T: TableMovabilityBase<Op>
        + arch::TableMovability<Op, <Self as TableMovabilityBase<Op>>::TableMoveInfo>,
> TableMovability<Op> for T
{
}

/// The operations used to actually access the page table structures
/// that involve writing to them, used to allow the same code to be
/// used in the host and the guest for page table setup.
pub trait TableOps: TableReadOps {
    /// This marker should be either [`MayMoveTable`] or
    /// [`MayNotMoveTable`], as the case may be.
    ///
    /// If this is [`MayMoveTable`], the return type of
    /// [`Self::write_entry`] and the parameter type of
    /// [`Self::update_root`] will be `<Self as
    /// TableReadOps>::TableAddr`. If it is [`MayNotMoveTable`], those
    /// types will be [`Void`].
    type TableMovability: TableMovability<Self>;

    /// Allocate a zeroed table
    ///
    /// # Safety
    /// The current implementations of this function are not
    /// inherently unsafe, but the guest implementation will likely
    /// become so in the future when a real physical page allocator is
    /// implemented.
    ///
    /// Currently, callers should take care not to call this on
    /// multiple threads at the same time.
    ///
    /// # Panics
    /// This function may panic if:
    /// - The Layout creation fails
    /// - Memory allocation fails
    unsafe fn alloc_table(&self) -> Self::TableAddr;

    /// Write a u64 to the given address, used to write updated page
    /// table entries. In some cases, the page table in which the entry
    /// is located may need to be relocated in order for this to
    /// succeed; if this is the case, the base address of the new
    /// table is returned.
    ///
    /// # Safety
    /// This writes to the given memory address, and so all the usual
    /// Rust things about raw pointers apply. This will also be used
    /// to update guest page tables, so especially in the guest, it is
    /// important to ensure that the page table updates do not break
    /// invariants. The implementor of the trait should ensure that
    /// nothing else will be reading/writing the address at the same
    /// time as mapping code using the trait.
    unsafe fn write_entry(
        &self,
        addr: Self::TableAddr,
        entry: PageTableEntry,
    ) -> Option<<Self::TableMovability as TableMovabilityBase<Self>>::TableMoveInfo>;

    /// Change the root page table to one at a different address
    ///
    /// # Safety
    /// This function will directly result in a change to virtual
    /// memory translation, and so is inherently unsafe w.r.t. the
    /// Rust memory model. All the caveats listed on [`map`] apply as
    /// well.
    unsafe fn update_root(
        &self,
        new_root: <Self::TableMovability as TableMovabilityBase<Self>>::TableMoveInfo,
    );
}

#[derive(Debug, PartialEq, Clone, Copy)]
pub struct BasicMapping {
    pub readable: bool,
    pub writable: bool,
    pub executable: bool,
}

#[derive(Debug, PartialEq, Clone, Copy)]
pub struct CowMapping {
    pub readable: bool,
    pub executable: bool,
}

#[derive(Debug, PartialEq, Clone, Copy)]
pub enum MappingKind {
    Unmapped,
    Basic(BasicMapping),
    Cow(CowMapping),
    /* TODO: What useful things other than basic mappings actually
     * require touching the tables? */
}

#[derive(Debug)]
pub struct Mapping {
    pub phys_base: u64,
    pub virt_base: u64,
    pub len: u64,
    pub kind: MappingKind,
    /// On architectures that support multiple privilege levels inside
    /// the guest, whether the mapping is accessible to the
    /// lower-privileged level (with the same permissions/behaviour as
    /// the upper-privileged level, for now).
    pub user_accessible: bool,
}

/// Assumption: all addresses and lengths are page-aligned.
///
/// # Safety
/// This function modifies pages backing a virtual memory range, which
/// is inherently unsafe w.r.t. the Rust memory model.
///
/// When using this function, please note:
/// - No locking is performed before touching page table data structures;
///   as such, do not use it concurrently with any other page table operations.
/// - TLB invalidation is not performed; if previously-mapped ranges
///   are being remapped, TLB invalidation may need to be performed
///   afterwards.
pub use arch::map;
/// This function is presently used for reading the tracing data; it is
/// also useful for debugging.
///
/// # Safety
/// This function traverses page table data structures, and should not
/// be called concurrently with any other operations that modify the
/// page table.
pub use arch::virt_to_phys;

//==================================================================================================
// Multi-space (aliased page-table) walking
//==================================================================================================

/// Identifier for a virtual address space, used by the multi-space
/// walker to describe which space "owns" a shared intermediate table.
/// Implementations typically use the physical address of the root
/// page table (which is unique per space).
pub type SpaceId = u64;

/// A reference from one address space to an intermediate page table
/// that lives in a different space. Produced by [`walk_va_spaces`] when
/// the walker encounters an intermediate table (at some `depth` below
/// the root) whose physical address was already seen via an earlier
/// root — i.e. the two spaces alias that sub-tree.
///
/// Semantics: the level-`depth` block in **our** space that contains
/// VAs starting at `our_va` is aliased to the level-`depth` block in
/// `space` that contains VAs starting at `their_va`. Everything below
/// that sub-tree — PDEs, PTEs, leaf mappings — is shared wholesale.
///
/// `depth` is counted from the root:
/// - `depth = 1` on i686: the shared thing is a leaf PT (the thing a
///   PDE points to).
/// - `depth = 1, 2, 3` on amd64: PDPT, PD, or PT respectively.
#[derive(Debug, Clone, Copy)]
pub struct SpaceReferenceMapping {
    /// Depth from the root at which the alias starts (1-based).
    pub depth: usize,
    /// The "owning" space — the first root that visited this
    /// intermediate PA during [`walk_va_spaces`].
    pub space: SpaceId,
    /// Start VA of the aliased sub-tree in OUR space.
    pub our_va: u64,
    /// Start VA of the aliased sub-tree in the owning space. Usually
    /// equal to `our_va` (kernel mappings at the same VA across
    /// processes) but the design permits different VAs.
    pub their_va: u64,
}

/// Either a normal leaf mapping in the current space, or a reference
/// to an intermediate table in another space. The compaction loop in
/// the host snapshotting code treats these two cases differently:
///
/// - `ThisSpace(m)` is rebuilt like any other leaf mapping: the
///   backing page is compacted into the new snapshot blob, the PTE is
///   written, and intermediate tables are allocated on demand.
/// - `AnotherSpace(r)` is rebuilt by *linking*: the entry in our
///   rebuilt root at depth `r.depth - 1` for `r.our_va` is made to
///   point at whatever table the owning space ended up with at
///   `r.their_va`. See [`space_aware_map`].
#[derive(Debug)]
pub enum SpaceAwareMapping {
    ThisSpace(Mapping),
    AnotherSpace(SpaceReferenceMapping),
}
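
// A hedged sketch of how a rebuilder might consume a topologically sorted
// walk result. The item list and the dispatch targets here are illustrative
// only (the real signatures of `walk_va_spaces` and `space_aware_map` live
// in the arch modules): leaf mappings are compacted and re-mapped, while
// cross-space references are resolved by linking against a space that, by
// the ordering guarantee documented below, has already been rebuilt.
#[cfg(test)]
mod space_aware_dispatch_sketch {
    use super::*;

    #[test]
    fn dispatches_leaves_and_links_separately() {
        let items = [
            SpaceAwareMapping::ThisSpace(Mapping {
                phys_base: 0x2000,
                virt_base: 0x40_0000,
                len: 0x1000,
                kind: MappingKind::Basic(BasicMapping {
                    readable: true,
                    writable: true,
                    executable: false,
                }),
                user_accessible: false,
            }),
            // The owning space is identified by its root PA (a plausible
            // `SpaceId` choice per the docs above); the alias here is a
            // kernel-style same-VA share at depth 1.
            SpaceAwareMapping::AnotherSpace(SpaceReferenceMapping {
                depth: 1,
                space: 0x1000,
                our_va: 0xffff_8000_0000_0000,
                their_va: 0xffff_8000_0000_0000,
            }),
        ];
        let (mut leaves, mut links) = (0, 0);
        for item in &items {
            match item {
                // Real code would compact the backing page and write the
                // PTE via `map`-style machinery.
                SpaceAwareMapping::ThisSpace(_) => leaves += 1,
                // Real code would call `space_aware_map` against the
                // already-rebuilt owning space.
                SpaceAwareMapping::AnotherSpace(_) => links += 1,
            }
        }
        assert_eq!((leaves, links), (1, 1));
    }
}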

/// Counterpart of [`walk_va_spaces`]'s `AnotherSpace` entries on the
/// write side: installs a link in `op`'s root PT tree at `ref_map.our_va`
/// that points at whatever intermediate table the owning space ended
/// up with at `ref_map.their_va` (in `built_roots[ref_map.space]`).
///
/// Callers must ensure that `built_roots` contains populated page
/// tables for any other space referenced by the mapping.
///
/// # Safety
/// Same invariants as [`map`]: the caller owns the concurrency story
/// around the page tables being written, and must invalidate TLBs
/// afterwards if they were live.
pub use arch::space_aware_map;
/// Walk multiple page-table roots together, emitting either a normal
/// leaf mapping (`ThisSpace`) or a reference to an alias that was
/// already seen via an earlier root (`AnotherSpace`).
///
/// The caller passes `roots` in their preferred order of primacy. The
/// first root to visit a particular intermediate PA becomes the
/// "owner" of that sub-table — subsequent roots that alias it receive
/// `AnotherSpace` entries referencing the owner.
///
/// The returned `Vec` is ordered the same way `roots` was passed — so
/// by construction the result is topologically sorted: every
/// `AnotherSpace` reference points to a space that appears earlier in
/// the list. This lets a rebuilder process roots in iteration order
/// without a separate sort pass, and guarantees that the
/// [`space_aware_map`] invariant is met.
///
/// # Safety
/// Same invariants as [`virt_to_phys`]. Callers must ensure the page
/// tables are not being mutated concurrently.
pub use arch::walk_va_spaces;