spacetimedb_table/page.rs

//! Provides a [`Page`] abstraction that stores rows
//! and an associated header necessary for the page to work.
//! Consult the documentation of this type for a list of operations
//! and a description of how pages work.
//!
//! A page can provide a split mutable view of its fixed section and its variable section.
//! This is provided through [`Page::split_fixed_var_mut`] with view operations
//! defined on [`FixedView`] and [`VarView`].
//!
//! [ralfj_safe_valid]: https://www.ralfj.de/blog/2018/08/22/two-kinds-of-invariants.html
//!
//! Technical terms:
//!
//! - `valid`, when referring to a type, granule, or row, means,
//!   depending on the context, a memory location that holds a *safe* object.
//!   When "valid for writes" is used, the location must be properly aligned
//!   and none of its bytes may be uninit,
//!   but the value need not be valid at the type in question.
//!   "Valid for writes" is equivalent to valid-unconstrained.
//!
//! - `valid-unconstrained`, when referring to a memory location with a given type,
//!   means that the location stores a byte pattern which Rust/LLVM's memory model recognizes as valid,
//!   and therefore must not contain any uninit,
//!   but the value is not required to be logically meaningful,
//!   and no code may depend on the data within it to uphold any invariants.
//!   E.g. an unallocated [`VarLenGranule`] within a page stores valid-unconstrained bytes,
//!   because the bytes are either 0 from the initial [`alloc_zeroed`] of the page,
//!   or contain stale data from a previously freed [`VarLenGranule`].
//!
//! - `unused` means that it is safe to overwrite a block of memory without cleaning up its previous value.
//!
//!   See the post [Two Kinds of Invariants: Safety and Validity][ralfj_safe_valid]
//!   for a discussion on safety and validity invariants.

use super::{
    blob_store::BlobStore,
    fixed_bit_set::FixedBitSet,
    indexes::{Byte, Bytes, PageOffset, PAGE_HEADER_SIZE, PAGE_SIZE},
    var_len::{is_granule_offset_aligned, VarLenGranule, VarLenGranuleHeader, VarLenMembers, VarLenRef},
};
use crate::{fixed_bit_set::IterSet, indexes::max_rows_in_page, static_assert_size, table::BlobNumBytes};
use core::{mem, ops::ControlFlow};
use spacetimedb_sats::layout::{Size, MIN_ROW_SIZE};
use spacetimedb_sats::memory_usage::MemoryUsage;
use spacetimedb_sats::{de::Deserialize, ser::Serialize};
use thiserror::Error;

#[derive(Error, Debug, PartialEq, Eq)]
pub enum Error {
    #[error("Want to allocate a var-len object of {need} granules, but have only {have} granules available")]
    InsufficientVarLenSpace { need: u16, have: u16 },
    #[error("Want to allocate a fixed-len row of {} bytes, but the page is full", need.len())]
    InsufficientFixedLenSpace { need: Size },
}

/// A cons-cell in a freelist either
/// for an unused fixed-len cell or a variable-length granule.
#[repr(C)] // Required for a stable ABI.
#[derive(Clone, Copy, Debug, PartialEq, Eq, bytemuck::NoUninit, Serialize, Deserialize)]
struct FreeCellRef {
    /// The address of the next free cell in a freelist.
    ///
    /// The `PageOffset::PAGE_END` is used as a sentinel to signal "`None`".
    next: PageOffset,
}

impl MemoryUsage for FreeCellRef {
    fn heap_usage(&self) -> usize {
        let Self { next } = self;
        next.heap_usage()
    }
}

impl FreeCellRef {
    /// The sentinel for NULL cell references.
    const NIL: Self = Self {
        next: PageOffset::PAGE_END,
    };

    /// Replaces the cell reference with `offset`, returning the existing one.
    #[inline]
    fn replace(&mut self, offset: PageOffset) -> FreeCellRef {
        let next = mem::replace(&mut self.next, offset);
        Self { next }
    }

    /// Returns whether the cell reference is non-empty.
    #[inline]
    const fn has(&self) -> bool {
        !self.next.is_at_end()
    }

    /// Take the first free cell in the freelist starting with `self`, if any,
    /// and promote the second free cell as the freelist head.
    ///
    /// # Safety
    ///
    /// When `self.has()`, `self.next`, after adjustment by `adjust_free`,
    /// must point to a valid `FreeCellRef` within `row_data`.
    #[inline]
    unsafe fn take_freelist_head(
        self: &mut FreeCellRef,
        row_data: &Bytes,
        adjust_free: impl FnOnce(PageOffset) -> PageOffset,
    ) -> Option<PageOffset> {
        self.has().then(|| {
            let head = adjust_free(self.next);
            // SAFETY: `self.has()`, so per the caller contract, `head` points to a valid `FreeCellRef`.
            let next = unsafe { get_ref(row_data, head) };
            self.replace(*next).next
        })
    }

    /// Prepend `new_head` to the freelist starting with `self`.
    ///
    /// SAFETY: `new_head`, after adjustment, must be in bounds of `row_data`.
    /// Moreover, it must be valid for writing a `FreeCellRef` to it,
    /// which includes being properly aligned with respect to `row_data` for a `FreeCellRef`.
    /// Additionally, `self` must contain a valid `FreeCellRef`.
    #[inline]
    unsafe fn prepend_freelist(
        self: &mut FreeCellRef,
        row_data: &mut Bytes,
        new_head: PageOffset,
        adjust_free: impl FnOnce(PageOffset) -> PageOffset,
    ) {
        let next = self.replace(new_head);
        let new_head = adjust_free(new_head);
        // SAFETY: Per caller contract, `new_head` is in bounds of `row_data`.
        // Moreover, `new_head` points to an unused `FreeCellRef`, so we can write to it.
        let next_slot: &mut FreeCellRef = unsafe { get_mut(row_data, new_head) };
        *next_slot = next;
    }
}

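// To illustrate the cons-cell freelists used throughout this page, the following is
// a minimal, self-contained model: free cells form a singly-linked LIFO list whose
// links are stored in the free cells themselves, with a sentinel playing the role of
// `PageOffset::PAGE_END`. It models the `FreeCellRef` threading through `row_data`
// with a plain array of indices, so it is only a sketch of the idea, not the page's layout.
#[cfg(test)]
mod freelist_model_sketch {
    /// Stand-in for the `PageOffset::PAGE_END` sentinel.
    const NIL: usize = usize::MAX;

    /// Like `FreeCellRef::prepend_freelist`: link `cell` in front of `*head`.
    fn prepend(links: &mut [usize], head: &mut usize, cell: usize) {
        links[cell] = *head;
        *head = cell;
    }

    /// Like `FreeCellRef::take_freelist_head`: pop the head, promoting its successor.
    fn take_head(links: &[usize], head: &mut usize) -> Option<usize> {
        (*head != NIL).then(|| {
            let cell = *head;
            *head = links[cell];
            cell
        })
    }

    #[test]
    fn freed_cells_are_reused_in_lifo_order() {
        let mut links = [NIL; 4];
        let mut head = NIL;
        prepend(&mut links, &mut head, 1);
        prepend(&mut links, &mut head, 3);
        assert_eq!(take_head(&links, &mut head), Some(3));
        assert_eq!(take_head(&links, &mut head), Some(1));
        assert_eq!(take_head(&links, &mut head), None);
    }
}
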
/// All the fixed size header information.
#[repr(C)] // Required for a stable ABI.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // So we can dump and restore pages during snapshotting.
struct FixedHeader {
    /// A pointer to the head of the freelist which stores
    /// all the unused (freed) fixed row cells.
    /// These cells can be reused when inserting a new row.
    next_free: FreeCellRef,

    /// High water mark (HWM) for fixed-length rows.
    /// Points one past the last-allocated (highest-indexed) fixed-length row,
    /// so to allocate a fixed-length row from the gap,
    /// post-increment this index.
    // TODO(perf,future-work): determine how to lower the high water mark when freeing the topmost row.
    last: PageOffset,

    /// The number of rows currently in the page.
    ///
    /// N.B. this is not the same as `self.present_rows.len()`
    /// as that counts both zero and one bits.
    num_rows: u16,

    // TODO(stable-module-abi): should this be inlined into the page?
    /// For each fixed-length row slot, true if a row is stored there,
    /// false if the slot is unallocated.
    ///
    /// Unallocated row slots store valid-unconstrained bytes, i.e. are never uninit.
    present_rows: FixedBitSet,
}

impl MemoryUsage for FixedHeader {
    fn heap_usage(&self) -> usize {
        let Self {
            next_free,
            last,
            num_rows,
            present_rows,
        } = self;
        next_free.heap_usage() + last.heap_usage() + num_rows.heap_usage() + present_rows.heap_usage()
    }
}

static_assert_size!(FixedHeader, 16);

impl FixedHeader {
    /// Returns a new `FixedHeader`
    /// using the provided `max_rows_in_page` to decide how many rows `present_rows` can represent.
    #[inline]
    fn new(max_rows_in_page: usize) -> Self {
        Self {
            next_free: FreeCellRef::NIL,
            // Points one after the last allocated fixed-length row, or `NULL` for an empty page.
            last: PageOffset::VAR_LEN_NULL,
            num_rows: 0,
            present_rows: FixedBitSet::new(max_rows_in_page),
        }
    }

    /// Set the (fixed) row starting at `offset`
    /// and lasting `fixed_row_size` as `present`.
    #[inline]
    fn set_row_present(&mut self, offset: PageOffset, fixed_row_size: Size) {
        self.set_row_presence(offset, fixed_row_size, true);
        self.num_rows += 1;
    }

    /// Sets whether the (fixed) row starting at `offset`
    /// and lasting `fixed_row_size` is `present` or not.
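    ///
    /// For example, with a `fixed_row_size` of 16 bytes,
    /// the row starting at offset 48 maps to bit `48 / 16 = 3` of `present_rows`.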
    #[inline]
    fn set_row_presence(&mut self, offset: PageOffset, fixed_row_size: Size, present: bool) {
        self.present_rows.set(offset / fixed_row_size, present);
    }

    /// Returns whether the (fixed) row starting at `offset`
    /// and lasting `fixed_row_size` is present or not.
    #[inline]
    fn is_row_present(&self, offset: PageOffset, fixed_row_size: Size) -> bool {
        self.present_rows.get(offset / fixed_row_size)
    }

    /// Resets the header information to its state
    /// when it was first created in [`FixedHeader::new`]
    /// but with `max_rows_in_page` instead of the value passed on creation.
    fn reset_for(&mut self, max_rows_in_page: usize) {
        self.next_free = FreeCellRef::NIL;
        self.last = PageOffset::VAR_LEN_NULL;
        self.num_rows = 0;
        self.present_rows.reset_for(max_rows_in_page);
    }

    /// Resets the header information to its state
    /// when it was first created in [`FixedHeader::new`].
    ///
    /// The header is only good for the original row size.
    #[inline]
    fn clear(&mut self) {
        self.next_free = FreeCellRef::NIL;
        self.last = PageOffset::VAR_LEN_NULL;
        self.num_rows = 0;
        self.present_rows.clear();
    }
}

/// All the var-len header information.
#[repr(C)] // Required for a stable ABI.
#[derive(bytemuck::NoUninit, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
struct VarHeader {
    /// A pointer to the head of the freelist which stores
    /// all the unused (freed) var-len granules.
    /// These cells can be reused when inserting a new row.
    next_free: FreeCellRef,

    /// The length of the freelist with its head referred to by `next_free`.
    /// Stored in units of var-len nodes.
    ///
    /// This field is redundant,
    /// as it can be recovered by traversing `next_free`.
    /// However, traversing this linked-list is not cache friendly,
    /// so we memoize the computation here.
    freelist_len: u16,

    /// High water mark (HWM) for var-len granules.
    /// Points to the last-allocated (lowest-indexed) var-len granule,
    /// so to allocate a var-len granule from the gap,
    /// pre-decrement this index.
    // TODO(perf,future-work): determine how to "lower" the high water mark when freeing the "top"-most granule.
    first: PageOffset,

    /// The number of granules currently used by rows within this page.
    ///
    /// [`Page::bytes_used_by_rows`] needs this information.
    /// Stored here because otherwise counting it would require traversing all the present rows.
    num_granules: u16,
}

impl MemoryUsage for VarHeader {
    fn heap_usage(&self) -> usize {
        let Self {
            next_free,
            freelist_len,
            first,
            num_granules,
        } = self;
        next_free.heap_usage() + freelist_len.heap_usage() + first.heap_usage() + num_granules.heap_usage()
    }
}

static_assert_size!(VarHeader, 8);

impl Default for VarHeader {
    fn default() -> Self {
        Self {
            next_free: FreeCellRef::NIL,
            freelist_len: 0,
            first: PageOffset::PAGE_END,
            num_granules: 0,
        }
    }
}

impl VarHeader {
    /// Resets the header information to its state
    /// when it was first created in [`VarHeader::default`].
    fn clear(&mut self) {
        *self = Self::default();
    }
}

/// The metadata / header of a page that is necessary
/// for modifying and interpreting the `row_data`.
///
/// This header info is split into a header for the fixed part
/// and one for the variable part.
/// The header is stored in the same heap allocation as the `row_data`
/// as the whole [`Page`] is `Box`ed.
#[repr(C)] // Required for a stable ABI.
#[repr(align(64))] // Alignment must be the same as `VarLenGranule::SIZE`.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // So we can dump and restore pages during snapshotting.
pub(super) struct PageHeader {
    /// The header data relating to the fixed component of a row.
    fixed: FixedHeader,
    /// The header data relating to the var-len component of a row.
    var: VarHeader,
    /// The content-addressed hash of the page on disk,
    /// if unmodified since the last snapshot,
    /// and `None` otherwise.
    ///
    /// This means that modifications to the page always set this field to `None`.
    unmodified_hash: Option<blake3::Hash>,
}

impl MemoryUsage for PageHeader {
    fn heap_usage(&self) -> usize {
        let Self {
            fixed,
            var,
            // MEMUSE: no allocation, ok to ignore
            unmodified_hash: _,
        } = self;
        fixed.heap_usage() + var.heap_usage()
    }
}

static_assert_size!(PageHeader, PAGE_HEADER_SIZE);

impl PageHeader {
    /// Returns a new `PageHeader` proper for a [`Page`] holding at most `max_rows_in_page` rows.
    fn new(max_rows_in_page: usize) -> Self {
        Self {
            fixed: FixedHeader::new(max_rows_in_page),
            var: VarHeader::default(),
            unmodified_hash: None,
        }
    }

    /// Resets the header information to its state
    /// when it was first created in [`PageHeader::new`]
    /// but with `max_rows_in_page` instead of the value passed on creation.
    fn reset_for(&mut self, max_rows_in_page: usize) {
        self.fixed.reset_for(max_rows_in_page);
        self.var.clear();
        self.unmodified_hash = None;
    }

    /// Resets the header information to its state
    /// when it was first created in [`PageHeader::new`].
    ///
    /// The header is only good for the original row size.
    fn clear(&mut self) {
        self.fixed.clear();
        self.var.clear();
        self.unmodified_hash = None;
    }

    /// Returns the maximum number of rows the page can hold.
    ///
    /// Note that this number can be bigger
    /// than the value provided in [`Self::new`] due to rounding up.
    pub(super) fn max_rows_in_page(&self) -> usize {
        self.fixed.present_rows.bits()
    }

    /// Returns a pointer to the `present_rows` bitset.
    /// This is exposed for testing only.
    #[cfg(test)]
    pub(super) fn present_rows_storage_ptr_for_test(&self) -> *const () {
        self.fixed.present_rows.storage().as_ptr().cast()
    }
}

/// Fixed-length row portions must be at least large enough to store a `FreeCellRef`.
const _MIN_ROW_SIZE_CAN_STORE_FCR: () = assert!(MIN_ROW_SIZE.len() >= mem::size_of::<FreeCellRef>());

/// [`VarLenGranule`]s must be at least large enough to store a [`FreeCellRef`].
const _VLG_CAN_STORE_FCR: () = assert!(VarLenGranule::SIZE.len() >= MIN_ROW_SIZE.len());

/// Pointers properly aligned for a [`VarLenGranule`] must be properly aligned for [`FreeCellRef`].
/// This is the case as the former's alignment is a multiple of the latter's alignment.
const _VLG_ALIGN_MULTIPLE_OF_FCR: () = assert!(mem::align_of::<VarLenGranule>() % mem::align_of::<FreeCellRef>() == 0);

/// The actual row data of a [`Page`].
type RowData = [Byte; PageOffset::PAGE_END.idx()];

/// A page of row data with an associated `header` and the raw `row_data` itself.
///
/// As a rough summary, the strategy employed by this page is:
///
/// - The fixed-len parts of rows grow left-to-right
///   and start from the beginning of the `row_data`
///   until their high water mark (fixed HWM), i.e., `self.header.fixed.last`.
///
/// - The var-len parts of rows grow right-to-left
///   and start from the end of the `row_data`
///   until their high water mark (variable HWM), i.e., `self.header.var.first`.
///
///   Each var-len object is stored as a linked list of chunks.
///   Each chunk in this case is a [`VarLenGranule`] taking up 64 bytes where:
///   - 6 bits = length, 10 bits = next-cell-pointer
///   - 62 bytes = the bytes of the object
///
/// - As new rows are added, the HWMs move appropriately.
///   When the fixed and variable HWMs meet, the page is full.
///
/// - When rows are freed, a freelist strategy is used both for
///   the fixed parts and each `VarLenGranule`.
///   These freelists are then used first before using space from the gap.
///   The heads of these freelists are stored in `next_free`
///   in the fixed and variable headers respectively.
///
/// - As the fixed parts of rows may store pointers into the var-len section,
///   to ensure that these pointers aren't dangling,
///   the page uses pointer fixups when adding to, deleting from, and copying the page.
///   These fixups are handled by having callers provide `VarLenMembers`
///   to find the var-len reference slots in the fixed parts.
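///
/// # Example
///
/// A rough sketch of how these pieces fit together; this is illustrative only,
/// since the API is crate-internal and some steps, like writing the fixed row, are elided:
///
/// ```ignore
/// // A page whose fixed-len rows are 16 bytes each.
/// let mut page = Page::new(Size(16));
///
/// // Split the page into mutable views over its fixed-len and var-len sections.
/// let (_fixed, mut var) = page.split_fixed_var_mut();
///
/// // Allocate granules for a small var-len object.
/// let (vlr, in_blob) = var
///     .alloc_for_slice(b"hello, page")
///     .expect("a fresh page has space for a small object");
/// assert!(!in_blob); // small objects live in the page, not the blob store
///
/// // `vlr` would then be written into the `VarLenRef` slot of a fixed-len row,
/// // at a position found via the row type's `VarLenMembers` visitor.
/// ```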
#[repr(C)]
// ^-- Required for a stable ABI.
#[repr(align(64))]
// ^-- Must have align at least that of `VarLenGranule`,
// so that `row_data[PageOffset::PAGE_END - VarLenGranule::SIZE]` is an aligned pointer to `VarLenGranule`.
// TODO(bikeshedding): consider raising the alignment. We may want this to be OS page (4096) aligned.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // So we can dump and restore pages during snapshotting.
pub struct Page {
    /// The header containing metadata on how to interpret and modify the `row_data`.
    header: PageHeader,
    /// The actual bytes stored in the page.
    /// This contains row data, fixed and variable, and freelists.
    row_data: RowData,
}

impl MemoryUsage for Page {
    fn heap_usage(&self) -> usize {
        self.header.heap_usage()
    }
}

static_assert_size!(Page, PAGE_SIZE);

/// A mutable view of the fixed-len section of a [`Page`].
pub struct FixedView<'page> {
    /// A mutable view of the fixed-len bytes.
    fixed_row_data: &'page mut Bytes,
    /// A mutable view of the fixed header.
    header: &'page mut FixedHeader,
}

impl FixedView<'_> {
    /// Returns a mutable view of the row from `start` lasting `fixed_row_size` bytes.
    ///
    /// This method is safe, but callers should take care that `start` and `fixed_row_size`
    /// are correct for this page, and that `start` is aligned.
    /// Callers should further ensure that mutations to the row leave the row bytes
    /// in an expected state, i.e. initialized where required by the row type,
    /// and with `VarLenRef`s that point to valid granules and with correct lengths.
    pub fn get_row_mut(&mut self, start: PageOffset, fixed_row_size: Size) -> &mut Bytes {
        &mut self.fixed_row_data[start.range(fixed_row_size)]
    }

    /// Returns a shared view of the row from `start` lasting `fixed_row_size` bytes.
    fn get_row(&mut self, start: PageOffset, fixed_row_size: Size) -> &Bytes {
        &self.fixed_row_data[start.range(fixed_row_size)]
    }

    /// Frees the row starting at `row_offset` and lasting `fixed_row_size` bytes.
    ///
    /// # Safety
    ///
    /// `range_move(0..fixed_row_size, row_offset)` must be in bounds of `row_data`.
    /// Moreover, it must be valid for writing a `FreeCellRef` to it,
    /// which includes being properly aligned with respect to `row_data` for a `FreeCellRef`.
    pub unsafe fn free(&mut self, row_offset: PageOffset, fixed_row_size: Size) {
        // TODO(perf,future-work): if `row` is at the HWM, return it to the gap.

        // SAFETY: Per caller contract, `row_offset` must be in bounds of `row_data`.
        // Moreover, it must be valid for writing a `FreeCellRef` to it,
        // which includes being properly aligned with respect to `row_data` for a `FreeCellRef`.
        // We also know that `self.header.next_free` contains a valid `FreeCellRef`.
        unsafe {
            self.header
                .next_free
                .prepend_freelist(self.fixed_row_data, row_offset, |x| x)
        };
        self.header.num_rows -= 1;
        self.header.set_row_presence(row_offset, fixed_row_size, false);
    }
}

/// A mutable view of the var-len section of a [`Page`].
pub struct VarView<'page> {
    /// A mutable view of the var-len bytes.
    var_row_data: &'page mut Bytes,
    /// A mutable view of the var-len header.
    header: &'page mut VarHeader,
    /// One past the end of the fixed-len section of the page.
    last_fixed: PageOffset,
}

impl<'page> VarView<'page> {
    /// Returns the number of granules required to store the data,
    /// whether the page has enough space,
    /// and whether the object needs to go in the blob store.
    ///
    /// If the third value is `true`, i.e., the object will go in the blob store,
    /// the first value will always be `1`.
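    ///
    /// For example, a 150-byte object that stays in the page requires
    /// `ceil(150 / 62) = 3` granules, since each [`VarLenGranule`] holds up to 62 bytes of data.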
    fn has_enough_space_for(&self, len_in_bytes: usize) -> (usize, bool, bool) {
        let (num_granules_req, in_blob) = VarLenGranule::bytes_to_granules(len_in_bytes);
        let enough_space = num_granules_req <= self.num_granules_available();
        (num_granules_req, enough_space, in_blob)
    }

    /// Returns the number of granules available for allocation.
    fn num_granules_available(&self) -> usize {
        self.header.freelist_len as usize
            + VarLenGranule::space_to_granules(gap_remaining_size(self.header.first, self.last_fixed))
    }

    /// Returns an adjuster that converts an offset in terms of `Page::row_data`
    /// into one in terms of `VarView::var_row_data`.
    ///
    /// This is needed because of `page.row_data.split_at_mut(last_fixed)`.
    #[inline(always)]
    fn adjuster(&self) -> impl FnOnce(PageOffset) -> PageOffset {
        let lf = self.last_fixed;
        move |offset| offset - lf
    }

    /// Allocates a linked-list of granules, in the var-len storage of the page,
    /// for a var-len object of `obj_len` bytes.
    ///
    /// Returns a [`VarLenRef`] pointing to the head of that list,
    /// and a boolean `in_blob` for whether the allocation is a `BlobHash`
    /// and the object must be inserted into the large-blob store.
    ///
    /// The length of each granule is set, but no data is written to any granule.
    /// Thus, the caller must proceed to write data to each granule for the claimed lengths.
    ///
    /// # Safety post-requirements
    ///
    /// The following are the safety *post-requirements* of calling this method.
    /// That is, this method is safe to call,
    /// but may leave the page in an inconsistent state
    /// which must be rectified before other **unsafe methods** may be called.
    ///
    /// 1. When the returned `in_blob` holds, caller must ensure that,
    ///    before the granule's data is read from / assumed to be initialized,
    ///    the granule pointed to by the returned `vlr.first_granule`
    ///    has an initialized header and a data section initialized to at least
    ///    as many bytes as claimed by the header.
    ///
    /// 2. The caller must initialize each granule with data for the claimed length
    ///    of the granule's data.
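    ///
    /// For example, with `obj_len = 150` the granule lengths are set to 62, 62, and 26:
    /// the remainder `150 % 62 = 26` goes to the last granule in the list,
    /// which is the first one allocated by this method.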
    pub fn alloc_for_len(&mut self, obj_len: usize) -> Result<(VarLenRef, bool), Error> {
        // Safety post-requirements of `alloc_for_obj_common`:
        // 1. caller promised they will be satisfied.
        // 2a. already satisfied as the closure below returns all the summands of `obj_len`.
        // 2b. caller promised in 2. that they will satisfy this.
        self.alloc_for_obj_common(obj_len, |req_granules| {
            let rem = obj_len % VarLenGranule::DATA_SIZE;
            (0..req_granules).map(move |rev_idx| {
                let len = if rev_idx == 0 && rem != 0 {
                    // The first allocated granule will be the last in the list.
                    // Thus, `rev_idx == 0` is the last element and might not take up a full granule.
                    rem
                } else {
                    VarLenGranule::DATA_SIZE
                };
                // Caller will initialize the granule's data for `len` bytes.
                (<&[u8]>::default(), len)
            })
        })
    }

    /// Returns an iterator over all offsets of the `VarLenGranule`s of the var-len object
    /// that has its first granule at offset `first_granule`.
    /// An empty iterator will be returned when `first_granule` is `NULL`.
    ///
    /// # Safety
    ///
    /// `first_granule` must be an offset to a granule or `NULL`.
    /// The data of the granule need not be initialized.
    pub unsafe fn granule_offset_iter(&mut self, first_granule: PageOffset) -> GranuleOffsetIter<'_, 'page> {
        GranuleOffsetIter {
            next_granule: first_granule,
            var_view: self,
        }
    }

    /// Allocates and stores `slice` as a linked-list of granules
    /// in the var-len storage of the page.
    ///
    /// Returns a [`VarLenRef`] pointing to the head of that list,
    /// and a boolean `in_blob` for whether the allocation is a `BlobHash`
    /// and the `slice` must be inserted into the large-blob store.
    ///
    /// # Safety post-requirements
    ///
    /// The following are the safety *post-requirements* of calling this method.
    /// That is, this method is safe to call,
    /// but may leave the page in an inconsistent state
    /// which must be rectified before other **unsafe methods** may be called.
    ///
    /// 1. When the returned `in_blob` holds, caller must ensure that,
    ///    before the granule's data is read from / assumed to be initialized,
    ///    the granule pointed to by the returned `vlr.first_granule`
    ///    has an initialized header and a data section initialized to at least
    ///    as many bytes as claimed by the header.
    pub fn alloc_for_slice(&mut self, slice: &[u8]) -> Result<(VarLenRef, bool), Error> {
        let obj_len = slice.len();
        // Safety post-requirement 2. of `alloc_for_obj_common` is already satisfied
        // as `chunks(slice)` will return sub-slices where the sum is `obj_len`.
        // Moreover, we initialize each granule already with the right data and length.
        // The requirement 1. is forwarded to the caller.
        let chunks = |_| VarLenGranule::chunks(slice).rev().map(|c| (c, c.len()));
        self.alloc_for_obj_common(obj_len, chunks)
    }

    /// Allocates for `obj_len` bytes as a linked-list of granules
    /// in the var-len storage of the page.
    ///
    /// For every granule in the aforementioned linked-list,
    /// the caller must provide an element in the *reversed* iterator `chunks`,
    /// given as pairs `(chunk, len)`.
    /// To each granule, `chunk` will be written, and the granule's length will be set to `len`.
    /// The caller may provide a `chunk` whose length differs from `len`.
    ///
    /// Returns a [`VarLenRef`] pointing to the head of that list,
    /// and a boolean `in_blob` for whether the allocation is a `BlobHash`
    /// and the `slice` must be inserted into the large-blob store.
    ///
    /// # Safety post-requirements
    ///
    /// The following are the safety *post-requirements* of calling this method.
    /// That is, this method is safe to call,
    /// but may leave the page in an inconsistent state
    /// which must be rectified before other **unsafe methods** may be called.
    ///
    /// 1. When the returned `in_blob` holds, caller must ensure that,
    ///    before the granule's data is read from / assumed to be initialized,
    ///    the granule pointed to by the returned `vlr.first_granule`
    ///    has an initialized header and a data section initialized to at least
    ///    as many bytes as claimed by the header.
    ///
    /// 2. Otherwise, when `in_blob` doesn't hold, the safety post-requirements are:
    ///
    ///    a. Let `cs = chunks(req_granules)` for the `req_granules` derived from `obj_len`.
    ///       Then, `obj_len == cs.map(|(_, len)| len).sum()`.
    ///
    ///    b. For each `(_, len) ∈ cs`, caller must ensure that
    ///       the relevant granule is initialized with data for at least `len`
    ///       before the granule's data is read from / assumed to be initialized.
    #[expect(clippy::doc_overindented_list_items)]
    fn alloc_for_obj_common<'chunk, Cs: Iterator<Item = (&'chunk [u8], usize)>>(
        &mut self,
        obj_len: usize,
        chunks: impl Copy + FnOnce(usize) -> Cs,
    ) -> Result<(VarLenRef, bool), Error> {
        // Check that we have sufficient space to allocate `obj_len` bytes in var-len data.
        let (req_granules, enough_space, in_blob) = self.has_enough_space_for(obj_len);
        if !enough_space {
            return Err(Error::InsufficientVarLenSpace {
                need: req_granules.try_into().unwrap_or(u16::MAX),
                have: self.num_granules_available().try_into().unwrap_or(u16::MAX),
            });
        }

        // For large blob objects, only reserve a granule.
        // The caller promised that they will initialize it with a blob hash.
        if in_blob {
            let vlr = self.alloc_blob_hash()?;
            return Ok((vlr, true));
        };

        // Write each `chunk` to var-len storage.
        // To do this, we allocate granules for and store the chunks in reverse,
        // starting with the end first.
        // The offset to the previous granule in the iteration is kept to
        // link it in as the next pointer in the current iteration.
        let mut next = PageOffset::VAR_LEN_NULL;
        debug_assert_eq!(obj_len, chunks(req_granules).map(|(_, len)| len).sum::<usize>());
        for (chunk, len) in chunks(req_granules) {
            // This should never error, since we already checked for available space.
            let granule = self.alloc_granule()?;
            // SAFETY:
            // 1. `granule` is properly aligned as it came from `alloc_granule`,
            //    and so is `next` as it's either NULL or was the previous `granule`.
            //    This also ensures that both are in bounds of the page
            //    for the range `granule..granule + VarLenGranule::SIZE`.
            //
            // 2. `next` is either NULL or was initialized in the previous loop iteration.
            //
            // 3. `granule` points to an unused slot as the space was just allocated.
            unsafe { self.write_chunk_to_granule(chunk, len, granule, next) };
            next = granule;
        }

        Ok((
            VarLenRef {
                first_granule: next,
                length_in_bytes: obj_len as u16,
            },
            false,
        ))
    }

    /// Allocates a granule for a large blob object
    /// and returns a [`VarLenRef`] pointing to that granule.
    ///
    /// The granule is not initialized by this method, and contains valid-unconstrained bytes.
    /// It is the caller's responsibility to initialize it with a [`BlobHash`](super::blob_hash::BlobHash).
    #[cold]
    fn alloc_blob_hash(&mut self) -> Result<VarLenRef, Error> {
        // Var-len hashes are 32 bytes, which fits within a single granule.
        self.alloc_granule().map(VarLenRef::large_blob)
    }

    /// Inserts `var_len_obj` into `blob_store`
    /// and stores the blob hash in the granule pointed to by `vlr.first_granule`.
    ///
    /// This insertion will never fail.
    ///
    /// # Safety
    ///
    /// `vlr.first_granule` must point to an unused `VarLenGranule` in bounds of this page,
    /// which must be valid for writes.
    pub unsafe fn write_large_blob_hash_to_granule(
        &mut self,
        blob_store: &mut dyn BlobStore,
        var_len_obj: &impl AsRef<[u8]>,
        vlr: VarLenRef,
    ) -> BlobNumBytes {
        let hash = blob_store.insert_blob(var_len_obj.as_ref());

        let granule = vlr.first_granule;
        // SAFETY:
        // 1. `granule` is properly aligned for `VarLenGranule` and is in bounds of the page.
        // 2. The null granule is trivially initialized.
        // 3. The caller promised that `granule` is safe to overwrite.
        unsafe { self.write_chunk_to_granule(&hash.data, hash.data.len(), granule, PageOffset::VAR_LEN_NULL) };
        var_len_obj.as_ref().len().into()
    }

    /// Write the `chunk` (data) to the [`VarLenGranule`] pointed to by `granule`,
    /// set the granule's length to be `len`,
    /// and set the next granule in the list to `next`.
    ///
    /// SAFETY:
    ///
    /// 1. Both `granule` and `next` must be properly aligned pointers to [`VarLenGranule`]s
    ///    and they must be in bounds of the page. However, neither needs to point to initialized data.
    ///
    /// 2. The caller must initialize the granule pointed to by `next`
    ///    before the granule-list is read from (e.g., iterated on).
    ///    The null granule is considered trivially initialized.
    ///
    /// 3. The space pointed to by `granule` must be unused and valid for writes,
    ///    and will be overwritten here.
    unsafe fn write_chunk_to_granule(&mut self, chunk: &[u8], len: usize, granule: PageOffset, next: PageOffset) {
        let granule = self.adjuster()(granule);
        // SAFETY: A `PageOffset` is always in bounds of the page.
        let ptr: *mut VarLenGranule = unsafe { offset_to_ptr_mut(self.var_row_data, granule).cast() };

        // TODO(centril,bikeshedding): check if creating the `VarLenGranule` first on stack
        // and then writing to `ptr` would have any impact on perf.
        // This would be nicer as it requires less `unsafe`.

        // We need to initialize the granule's header
        // without materializing a `&mut` as that is instant UB.
        // SAFETY: `ptr` isn't NULL as `&mut self.var_row_data` itself is a non-null pointer.
        let header = unsafe { &raw mut (*ptr).header };

        // SAFETY: `header` is valid for writes as only we have exclusive access.
        //          (1) The `ptr` was also promised as aligned
        //          and `granule..granule + 64 bytes` is in bounds of the page per caller contract.
        //          (2) Moreover, `next` will be an initialized granule per caller contract,
        //          so we can link it into the list without causing UB elsewhere.
        //          (3) It's also OK to write to `granule` as it's unused.
        unsafe {
            header.write(VarLenGranuleHeader::new(len as u8, next));
        }

        // SAFETY: We can treat any part of `row_data` as `.data`. Also (1) and (2).
        let data = unsafe { &mut (*ptr).data };

        // Copy the data into the granule.
        data[0..chunk.len()].copy_from_slice(chunk);
    }

    /// Allocate a [`VarLenGranule`] at the returned [`PageOffset`].
    ///
    /// The allocated storage is not initialized by this method,
    /// and will be valid-unconstrained at [`VarLenGranule`].
    ///
    /// This offset will be properly aligned for `VarLenGranule` when converted to a pointer.
    ///
    /// Returns an error when there are neither free granules nor space in the gap left.
    fn alloc_granule(&mut self) -> Result<PageOffset, Error> {
        let granule = self
            .alloc_from_freelist()
            .or_else(|| self.alloc_from_gap())
            .ok_or(Error::InsufficientVarLenSpace { need: 1, have: 0 })?;

        debug_assert!(
            is_granule_offset_aligned(granule),
            "Allocated an unaligned var-len granule: {granule:x}",
        );

        self.header.num_granules += 1;

        Ok(granule)
    }

    /// Allocate a [`VarLenGranule`] at the returned [`PageOffset`]
    /// taken from the freelist, if any.
    #[inline]
    fn alloc_from_freelist(&mut self) -> Option<PageOffset> {
        // SAFETY: `header.next_free` points to a `c: FreeCellRef` when the former `.has()`.
        let free = unsafe {
            self.header
                .next_free
                .take_freelist_head(self.var_row_data, |o| o - self.last_fixed)
        }?;
        self.header.freelist_len -= 1;
        Some(free)
    }

    /// Allocate a [`VarLenGranule`] at the returned [`PageOffset`]
    /// taken from the gap, if there is space left, or `None` if there is insufficient space.
    #[inline]
    fn alloc_from_gap(&mut self) -> Option<PageOffset> {
        if gap_enough_size_for_row(self.header.first, self.last_fixed, VarLenGranule::SIZE) {
            // `var.first` points *at* the lowest-indexed var-len granule,
            // *not* before it, so pre-decrement.
            self.header.first -= VarLenGranule::SIZE;
            Some(self.header.first)
        } else {
            None
        }
    }

    /// Free a single var-len granule pointed to by `offset`.
    ///
    /// SAFETY: `offset` must point to a valid [`VarLenGranule`].
    #[inline]
    unsafe fn free_granule(&mut self, offset: PageOffset) {
        // TODO(perf,future-work): if `chunk` is at the HWM, return it to the gap.
        //       Returning a single chunk to the gap is easy,
        //       but we want to return a whole "run" of sequential freed chunks,
        //       which requires some bookkeeping (or an O(> n) linked list traversal).
        self.header.freelist_len += 1;
        self.header.num_granules -= 1;
        let adjuster = self.adjuster();

        // SAFETY: Per caller contract, `offset` is a valid `VarLenGranule`,
        // and is therefore in bounds of the page row data.
        // By `_VLG_CAN_STORE_FCR`, and as we won't be reading from the granule anymore,
        // we know that this makes it valid for writing a `FreeCellRef` to it.
        // Moreover, by `_VLG_ALIGN_MULTIPLE_OF_FCR`,
        // the derived pointer is properly aligned (64) for a granule
        // and as `64 % 2 == 0` the alignment of a granule works for a `FreeCellRef`.
        // Finally, `self.header.next_free` contains a valid `FreeCellRef`.
        unsafe {
            self.header
                .next_free
                .prepend_freelist(self.var_row_data, offset, adjuster)
        };
    }

    /// Returns a reference to the granule at `offset`.
    ///
    /// SAFETY: `offset` must point to a valid [`VarLenGranule`].
    unsafe fn get_granule_ref(&self, offset: PageOffset) -> &VarLenGranule {
        unsafe { get_ref(self.var_row_data, self.adjuster()(offset)) }
    }

    /// Frees the blob pointed to by the [`BlobHash`] stored in the granule at `offset`.
    ///
    /// Panics when `offset` is NULL.
    ///
    /// SAFETY: `offset` must point to a valid [`VarLenGranule`] or be NULL.
    #[cold]
    #[inline(never)]
    unsafe fn free_blob(&self, offset: PageOffset, blob_store: &mut dyn BlobStore) -> BlobNumBytes {
        assert!(!offset.is_var_len_null());

        // SAFETY: Per caller contract + the assertion above,
        // we know `offset` refers to a valid `VarLenGranule`.
        let granule = unsafe { self.get_granule_ref(offset) };

        // Extract the blob hash from the granule.
        let hash = granule.blob_hash();

        // The size of `deleted_bytes` is calculated here instead of requesting it from `blob_store`.
        // This is because the actual number of bytes deleted depends on the `blob_store`'s logic.
        // We prefer to measure it from the datastore's point of view.
        let blob_store_deleted_bytes = blob_store
            .retrieve_blob(&hash)
            .expect("failed to free var-len blob")
            .len()
            .into();

        // Actually free the blob.
        blob_store.free_blob(&hash).expect("failed to free var-len blob");

        blob_store_deleted_bytes
    }

    /// Frees an entire var-len linked-list object.
    ///
    /// If the `var_len_obj` is a large blob,
    /// the `VarLenGranule` which stores its blob hash will be freed from the page,
    /// but the blob itself will not be freed from the blob store.
    /// If used incorrectly, this may leak large blobs.
    ///
    /// This behavior is used to roll back on failure in [`crate::bflatn::ser::write_av_to_page`],
    /// where inserting large blobs is deferred until all allocations succeed.
    /// Freeing a fully-inserted object should instead use [`Self::free_object`].
    ///
    /// # Safety
    ///
    /// `var_len_obj.first_granule` must point to a valid [`VarLenGranule`] or be NULL.
    pub unsafe fn free_object_ignore_blob(&mut self, var_len_obj: VarLenRef) {
        let mut next_granule = var_len_obj.first_granule;

        while !next_granule.is_var_len_null() {
            // SAFETY: Per caller contract, `first_granule` points to a valid granule or is NULL.
            // We know however at this point that it isn't NULL, so it is valid.
            // Thus the successor, too, is a valid granule or NULL.
            // However, again, at this point we know that the successor isn't NULL.
            // It follows then by induction that any `next_granule` at this point is valid.
            // Thus we have fulfilled the requirement that `next_granule` points to a valid granule.
            let header = unsafe { self.get_granule_ref(next_granule) }.header;
            // SAFETY: `next_granule` still points to a valid granule per above.
            unsafe {
                self.free_granule(next_granule);
            }
            next_granule = header.next();
        }
    }

    /// Frees an entire var-len linked-list object.
    ///
    /// SAFETY: `var_len_obj.first_granule` must point to a valid [`VarLenGranule`] or be NULL.
    unsafe fn free_object(&mut self, var_len_obj: VarLenRef, blob_store: &mut dyn BlobStore) -> BlobNumBytes {
        let mut blob_store_deleted_bytes = BlobNumBytes::default();
        // For large blob objects, extract the hash and tell `blob_store` to discard it.
        if var_len_obj.is_large_blob() {
            // SAFETY: `var_len_obj.first_granule` was promised to
            // point to a valid [`VarLenGranule`] or be NULL, as required.
            unsafe {
                blob_store_deleted_bytes = self.free_blob(var_len_obj.first_granule, blob_store);
            }
        }

        // SAFETY: `free_object_ignore_blob` has the same safety contract as this method.
        unsafe {
            self.free_object_ignore_blob(var_len_obj);
        }

        blob_store_deleted_bytes
    }
}

/// An iterator yielding the offsets to the granules of a var-len object.
pub struct GranuleOffsetIter<'vv, 'page> {
    /// Our mutable view of the page.
    var_view: &'vv mut VarView<'page>,
    /// The offset of the next granule, which will be yielded next.
    next_granule: PageOffset,
}

impl GranuleOffsetIter<'_, '_> {
    /// Returns, for the `granule` at `offset`, a mutable view of `granule.data[start..]`.
    ///
    /// # Safety
    ///
    /// - `offset` must point to a valid granule
    /// - `start < VarLenGranule::DATA_SIZE`
    pub unsafe fn get_mut_data(&mut self, offset: PageOffset, start: usize) -> &mut Bytes {
        // SAFETY: Caller promised that `offset` points to a valid granule.
        let granule: &mut VarLenGranule = unsafe { get_mut(self.var_view.var_row_data, offset) };
        // SAFETY: Caller promised `start < granule.data.len()`.
        unsafe { granule.data.as_mut_slice().get_unchecked_mut(start..) }
    }
}

impl Iterator for GranuleOffsetIter<'_, '_> {
    type Item = PageOffset;
    fn next(&mut self) -> Option<Self::Item> {
        let adjust = self.var_view.adjuster();

        if self.next_granule.is_var_len_null() {
            return None;
        }
        let ret = adjust(self.next_granule);
        // SAFETY: By construction,
        // the initial `next_granule` was promised to either be `NULL` or point to a valid granule.
        // For a given granule, the same applies to its `.next()` granule.
        // At this point, we've excluded `NULL`,
        // so we know inductively that `next_granule` points to a valid granule, as required.
        let granule: &VarLenGranule = unsafe { get_ref(self.var_view.var_row_data, ret) };
        self.next_granule = granule.header.next();

        Some(ret)
    }
}

/// Assert that `ptr` is sufficiently aligned to reference a value of `T`.
///
/// In release mode, this is a no-op.
fn assert_alignment<T>(ptr: *const Byte) {
    debug_assert_eq!(
        ptr as usize % mem::align_of::<T>(),
        0,
        "Wanted a PageOffset with align 0x{:x} (for {}) but found 0x{:x}",
        mem::align_of::<T>(),
        std::any::type_name::<T>(),
        ptr as usize,
    );
}

/// Returns a reference to the [`T`] pointed to by `offset`.
///
/// # Safety
///
/// `offset` must point to a valid `T` in `row_data`.
#[inline]
pub unsafe fn get_ref<T>(row_data: &Bytes, offset: PageOffset) -> &T {
    // SAFETY: Caller promised that `offset` is in bounds of `row_data`.
    let ptr = unsafe { offset_to_ptr(row_data, offset) };
    assert_alignment::<T>(ptr);
    let ptr = ptr.cast::<T>();
    // SAFETY: Caller promised that `offset` points to a `T` in `row_data`.
    unsafe { &*ptr }
}

/// Returns a mutable reference to the [`T`] pointed to by `offset`.
///
/// # Safety
///
/// `offset` must point to a valid `T` in `row_data`.
#[inline]
unsafe fn get_mut<T>(row_data: &mut Bytes, offset: PageOffset) -> &mut T {
    // SAFETY: Caller promised that `offset` is in bounds of `row_data`.
    let ptr = unsafe { offset_to_ptr_mut(row_data, offset) };
    assert_alignment::<T>(ptr as *const Byte);
    let ptr = ptr.cast::<T>();
    // SAFETY: Caller promised that `offset` points to a `T` in `row_data`.
    unsafe { &mut *ptr }
}

/// Returns a raw const pointer into the `row_data` at `offset` bytes.
///
/// # Safety
///
/// `offset` must be in bounds or one past the end of `row_data`.
#[inline]
unsafe fn offset_to_ptr(row_data: &Bytes, offset: PageOffset) -> *const Byte {
    debug_assert!(offset.idx() <= row_data.len());

    // SAFETY: per caller contract, `offset` is in bounds or one past the end of `row_data`.
    unsafe { row_data.as_ptr().add(offset.idx()) }
}

/// Returns a raw mutable pointer into the `row_data` at `offset` bytes.
///
/// SAFETY: `offset` must be in bounds or one past the end of `row_data`.
#[inline]
unsafe fn offset_to_ptr_mut(row_data: &mut Bytes, offset: PageOffset) -> *mut Byte {
    debug_assert!(offset.idx() <= row_data.len());

    // SAFETY: per caller contract, `offset` is in bounds or one past the end of `row_data`.
    unsafe { row_data.as_mut_ptr().add(offset.idx()) }
}

/// Returns the size of the gap,
/// assuming `first_var` is the high water mark (HWM) of the var-len section,
/// pointing *at* the granule with the lowest offset,
/// and `last_fixed` is the HWM of the fixed-len section,
/// pointing *one past the end* of the last fixed row.
#[inline]
fn gap_remaining_size(first_var: PageOffset, last_fixed: PageOffset) -> Size {
    // For illustration, suppose `row_data` is 10 bytes, i.e., `[Byte; 10]`.
    // Let's assume the following setup with a full page,
    // where capital letters are fixed rows and lower case are variable.
    //
    // [ A, B, C, D, E, f, g, h, i, j ]
    //                  ^
    //               first_var
    //                  ^
    //               last_fixed
    //
    // The high water mark `first_var` points *at* the granule with the lowest offset (`f`),
    // whereas `last_fixed` points *one past the end* (`f`) of the last fixed row (`E`).
    //
    // This is the case we have to consider in terms of possible underflow.
    // As both HWMs would point at the same place,
    // the result would be `0`, and no underflow occurs.
    Size((first_var - last_fixed).0)
}

/// Returns whether the remaining gap is large enough to host an object `fixed_row_size` large,
/// assuming `first_var` is the high water mark (HWM) of the var-len section,
/// pointing *at* the granule with the lowest offset,
/// and `last_fixed` is the HWM of the fixed-len section,
/// pointing *one past the end* of the last fixed row.
#[inline]
fn gap_enough_size_for_row(first_var: PageOffset, last_fixed: PageOffset, fixed_row_size: Size) -> bool {
    gap_remaining_size(first_var, last_fixed) >= fixed_row_size
}

impl Page {
    /// Returns a new page allocated on the heap.
    ///
    /// The new page supports rows of size `fixed_row_size`.
    pub fn new(fixed_row_size: Size) -> Box<Self> {
        Self::new_with_max_row_count(max_rows_in_page(fixed_row_size))
    }

    /// Returns a new page allocated on the heap.
    ///
    /// The new page supports at most `max_rows_in_page` rows.
    pub fn new_with_max_row_count(max_rows_in_page: usize) -> Box<Self> {
        // TODO(perf): mmap? allocator may do so already.
        // mmap may be more efficient as we save allocator metadata.
        use std::alloc::{alloc_zeroed, handle_alloc_error, Layout};

        let layout = Layout::new::<Page>();

        // Allocate with `alloc_zeroed` so that the bytes are initially 0, rather than uninit.
        // We will never write an uninit byte into the page except in the `PageHeader`,
        // so it is safe for `row_data` to have type `[u8; _]` rather than `[MaybeUninit<u8>; _]`.
        // `alloc_zeroed` may be more efficient than `alloc` + `memset`;
        // in particular, it may `mmap` pages directly from the OS, which are always zeroed for security reasons.
        // TODO: use Box::new_zeroed() once stabilized.
        // SAFETY: The layout's size is non-zero.
        let raw: *mut Page = unsafe { alloc_zeroed(layout) }.cast();

        if raw.is_null() {
            handle_alloc_error(layout);
        }

        // We need to initialize `Page::header`
        // without materializing a `&mut` as that is instant UB.
        // SAFETY: `raw` isn't NULL.
        let header = unsafe { &raw mut (*raw).header };

        // SAFETY: `header` is valid for writes as only we have exclusive access.
        //          The pointer is also aligned.
        unsafe { header.write(PageHeader::new(max_rows_in_page)) };

        // SAFETY: We used the global allocator with a layout for `Page`.
        //         We have initialized the `header`,
        //         and the `row_data` bytes are initially 0 by `alloc_zeroed`,
        //         making the pointee a `Page` valid for reads and writes.
        unsafe { Box::from_raw(raw) }
    }

    /// Returns the number of rows stored in this page.
    ///
    /// This method runs in constant time.
    pub fn num_rows(&self) -> usize {
        self.header.fixed.num_rows as usize
    }

    #[cfg(test)]
    /// Use this page's present rows bitvec to compute the number of present rows.
    ///
    /// This can be compared with [`Self::num_rows`] as a consistency check during tests.
    pub fn reconstruct_num_rows(&self) -> usize {
        // If we cared, we could rewrite this to `u64::count_ones` on each block of the bitset.
        // We do not care. This method is slow.
        self.header.fixed.present_rows.iter_set().count()
    }

    /// Returns the number of var-len granules allocated in this page.
    ///
    /// This method runs in constant time.
    pub fn num_var_len_granules(&self) -> usize {
        self.header.var.num_granules as usize
    }

    #[cfg(test)]
    /// # Safety
    ///
    /// - `var_len_visitor` must be a valid [`VarLenMembers`] visitor
    ///   specialized to the type and layout of rows within this [`Page`].
    /// - `fixed_row_size` must be exactly the length in bytes of fixed rows in this page,
    ///   which must further be the length of rows expected by the `var_len_visitor`.
    pub unsafe fn reconstruct_num_var_len_granules(
        &self,
        fixed_row_size: Size,
        var_len_visitor: &impl VarLenMembers,
    ) -> usize {
        self.iter_fixed_len(fixed_row_size)
            .flat_map(|row| unsafe {
                // SAFETY: `row` came out of `iter_fixed_len`,
                // which, due to caller requirements on `fixed_row_size`,
                // is giving us valid, aligned, initialized rows of the row type.
                var_len_visitor.visit_var_len(self.get_row_data(row, fixed_row_size))
            })
            .flat_map(|var_len_obj| unsafe {
                // SAFETY: We believe `row` to be valid
                // and `var_len_visitor` to be correctly visiting its var-len members.
                // Therefore, `var_len_obj` is a valid var-len object.
                self.iter_var_len_object(var_len_obj.first_granule)
            })
            .count()
    }

    /// Returns the number of bytes used by rows stored in this page.
    ///
    /// This is necessarily an overestimate of live data bytes, as it includes:
    /// - Padding bytes within the fixed-length portion of the rows.
    /// - [`VarLenRef`] pointer-like portions of rows.
    /// - Unused trailing parts of partially-filled [`VarLenGranule`]s.
    /// - [`VarLenGranule`]s used to store [`BlobHash`]es.
    ///
    /// Note that large blobs themselves are not counted.
    /// The caller should obtain a count of the bytes used by large blobs
    /// from the [`super::blob_store::BlobStore`].
    ///
    /// This method runs in constant time.
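    ///
    /// For example, a page holding 10 fixed rows of 24 bytes each
    /// and 3 var-len granules reports `10 * 24 + 3 * 64 = 432` bytes.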
1236    pub fn bytes_used_by_rows(&self, fixed_row_size: Size) -> usize {
1237        let fixed_row_bytes = self.num_rows() * fixed_row_size.len();
1238        let var_len_bytes = self.num_var_len_granules() * VarLenGranule::SIZE.len();
1239        fixed_row_bytes + var_len_bytes
1240    }
1241
1242    #[cfg(test)]
1243    /// # Safety
1244    ///
1245    /// - `var_len_visitor` must be a valid [`VarLenMembers`] visitor
1246    ///   specialized to the type and layout of rows within this [`Page`].
1247    /// - `fixed_row_size` must be exactly the length in bytes of fixed rows in this page,
1248    ///   which must further be the length of rows expected by the `var_len_visitor`.
1249    pub unsafe fn reconstruct_bytes_used_by_rows(
1250        &self,
1251        fixed_row_size: Size,
1252        var_len_visitor: &impl VarLenMembers,
1253    ) -> usize {
1254        let fixed_row_bytes = self.reconstruct_num_rows() * fixed_row_size.len();
1255        let var_len_bytes = unsafe { self.reconstruct_num_var_len_granules(fixed_row_size, var_len_visitor) }
1256            * VarLenGranule::SIZE.len();
1257        fixed_row_bytes + var_len_bytes
1258    }
1259
1260    /// Returns the range of row data starting at `offset` and lasting `size` bytes.
1261    pub fn get_row_data(&self, row: PageOffset, size: Size) -> &Bytes {
1262        &self.row_data[row.range(size)]
1263    }
1264
1265    /// Returns whether the row at `offset` is present or not.
1266    pub fn has_row_offset(&self, fixed_row_size: Size, offset: PageOffset) -> bool {
1267        // Check that the `offset` is properly aligned for a row of size `fixed_row_size`.
1268        // This cannot be `debug_assert!` as the caller could rely on this
1269        // reporting properly whether `offset` is at a row boundary or not.
1270        assert_eq!(offset.idx() % fixed_row_size.len(), 0);
1271
1272        self.header.fixed.is_row_present(offset, fixed_row_size)
1273    }
1274
1275    /// Returns split mutable views of this page over the fixed and variable sections.
1276    pub fn split_fixed_var_mut(&mut self) -> (FixedView<'_>, VarView<'_>) {
1277        // The fixed HWM (`fixed.last`) points *one past the end* of the fixed section
1278        // which is exactly what we want for `split_at_mut`.
1279        let last_fixed = self.header.fixed.last;
1280        let (fixed_row_data, var_row_data) = self.row_data.split_at_mut(last_fixed.idx());
1281
1282        // Construct the fixed-len view.
1283        let fixed = FixedView {
1284            fixed_row_data,
1285            header: &mut self.header.fixed,
1286        };
1287
1288        // Construct the var-len view.
1289        let var = VarView {
1290            var_row_data,
1291            header: &mut self.header.var,
1292            last_fixed,
1293        };
1294
1295        (fixed, var)
1296    }
1297
1298    /// Returns a mutable view of the row from `start` lasting `fixed_row_size` number of bytes.
1299    ///
1300    /// This method is safe, but callers should take care that `start` and `fixed_row_size`
1301    /// are correct for this page, and that `start` is aligned.
1302    /// Callers should further ensure that mutations to the row leave the row bytes
1303    /// in an expected state, i.e. initialized where required by the row type,
1304    /// and with `VarLenRef`s that point to valid granules and with correct lengths.
1305    ///
1306    /// This call will clear the unmodified hash
1307    /// as it is expected that the caller will alter the page.
1308    pub fn get_fixed_row_data_mut(&mut self, start: PageOffset, fixed_row_size: Size) -> &mut Bytes {
1309        self.header.unmodified_hash = None;
1310        &mut self.row_data[start.range(fixed_row_size)]
1311    }
1312
1313    /// Returns the total number of var-len granules required to store `objects`.
1314    pub fn total_granules_required_for_objects(objects: &[impl AsRef<[u8]>]) -> usize {
1315        objects
1316            .iter()
1317            .map(|obj| VarLenGranule::bytes_to_granules(obj.as_ref().len()).0)
1318            .sum()
1319    }
1320
1321    /// Does the page have space to store a row,
1322    /// where the fixed size part is `fixed_row_size` bytes large,
1323    /// and the row has the given `var_len_objects`?
1324    pub fn has_space_for_row_with_objects(&self, fixed_row_size: Size, var_len_objects: &[impl AsRef<[u8]>]) -> bool {
1325        let num_granules_required = Self::total_granules_required_for_objects(var_len_objects);
1326        self.has_space_for_row(fixed_row_size, num_granules_required)
1327    }
1328
1329    /// Does the page have space to store a row,
1330    /// where the fixed size part is `fixed_row_size` bytes large,
1331    /// and the variable part requires `num_granules` granules?
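    ///
    /// # Example
    ///
    /// A sketch of the check-then-insert pattern this method supports
    /// (mirroring the `insert_while` helper in this file's tests; not compiled as a doc-test):
    ///
    /// ```ignore
    /// // Keep inserting 8-byte rows with no var-len members until the page reports no space.
    /// while page.has_space_for_row(Size(8), 0) {
    ///     let offset = unsafe { page.alloc_fixed_len(Size(8)) }
    ///         .expect("has_space_for_row said there was room");
    ///     // ... write the row bytes at `offset` ...
    /// }
    /// ```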
1332    pub fn has_space_for_row(&self, fixed_row_size: Size, num_granules: usize) -> bool {
1333        // Determine the gap remaining after allocating for the fixed part.
1334        let gap_remaining = gap_remaining_size(self.header.var.first, self.header.fixed.last);
1335        let gap_avail_for_granules = if self.header.fixed.next_free.has() {
1336            // If we have a free fixed length block, then we can use the whole gap for var-len granules.
1337            gap_remaining
1338        } else {
1339            // If we need to grow the fixed-length store into the gap,
1340            if gap_remaining < fixed_row_size {
1341                // if the gap is too small for the fixed-length row, fail.
1342                return false;
1343            }
1344            // Otherwise, the space available in the gap for var-len granules
1345            // is the current gap size less the fixed-len row size.
1346            gap_remaining - fixed_row_size
1347        };
1348
1349        // Convert the gap size to granules.
1350        let gap_in_granules = VarLenGranule::space_to_granules(gap_avail_for_granules);
1351        // Account for granules available in the freelist.
1352        let needed_granules_after_freelist = num_granules.saturating_sub(self.header.var.freelist_len as usize);
1353
1354        gap_in_granules >= needed_granules_after_freelist
1355    }
1356
1357    /// Returns whether the page is full with respect to storing a fixed row with `fixed_row_size`
1358    /// and no variable component.
1359    pub fn is_full(&self, fixed_row_size: Size) -> bool {
1360        !self.has_space_for_row(fixed_row_size, 0)
1361    }
1362
1363    /// Will leave partially-allocated chunks if it fails prematurely,
1364    /// so always check `Self::has_space_for_row` before calling.
1365    ///
1366    /// This method is provided for testing the page store directly;
1367    /// higher-level codepaths are expected to use [`crate::bflatn::ser::write_av_to_page`],
1368    /// which performs similar operations to this method,
1369    /// but handles rollback on failure appropriately.
1370    ///
1371    /// This function will never fail if `Self::has_space_for_row` has returned true.
1372    ///
1373    /// # Safety
1374    ///
1375    /// - `var_len_visitor` is suitable for visiting var-len refs in `fixed_row`.
1376    ///
1377    /// - `fixed_row.len()` must be consistent with `var_len_visitor` and `self`.
1378    ///   That is, `VarLenMembers` must be specialized for a row type with that length,
1379    ///   and all past, present, and future fixed-length rows stored in this `Page`
1380    ///   must also be of that length.
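    ///
    /// # Example
    ///
    /// A sketch of a typical call (mirroring the `insert_u64` helper in this file's tests;
    /// `AlignedVarLenOffsets::from_offsets(&[])` is a visitor for a row type with no var-len members):
    ///
    /// ```ignore
    /// let mut page = Page::new(Size(8));
    /// let visitor = AlignedVarLenOffsets::from_offsets(&[]);
    /// let offset = unsafe {
    ///     page.insert_row(&42u64.to_le_bytes(), &[] as &[&[u8]], &visitor, &mut NullBlobStore)
    /// }
    /// .expect("empty page should have space for one row");
    /// ```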
1381    pub unsafe fn insert_row(
1382        &mut self,
1383        fixed_row: &Bytes,
1384        var_len_objects: &[impl AsRef<[u8]>],
1385        var_len_visitor: &impl VarLenMembers,
1386        blob_store: &mut dyn BlobStore,
1387    ) -> Result<PageOffset, Error> {
1388        // Allocate the fixed-len row.
1389        let fixed_row_size = Size(fixed_row.len() as u16);
1390
1391        // SAFETY: Caller promised that `fixed_row.len()` uses the right `fixed_row_size`
1392        // and we trust that others have too.
1393        let fixed_len_offset = unsafe { self.alloc_fixed_len(fixed_row_size)? };
1394
1395        // Store the fixed-len row.
1396        let (mut fixed, mut var) = self.split_fixed_var_mut();
1397        let row = fixed.get_row_mut(fixed_len_offset, fixed_row_size);
1398        row.copy_from_slice(fixed_row);
1399
1400        // Store all var-len refs into their appropriate slots in the fixed-len row.
1401        // SAFETY:
1402        // - The `fixed_len_offset` given by `alloc_fixed_len` results in `row`
1403        //   being properly aligned for the row type.
1404        // - Caller promised that `fixed_row.len()` matches the row type size exactly.
1405        // - `var_len_visitor` is suitable for `fixed_row`.
1406        let vlr_slot_iter = unsafe { var_len_visitor.visit_var_len_mut(row) };
1407        for (var_len_ref_slot, var_len_obj) in vlr_slot_iter.zip(var_len_objects) {
1408            let (var_len_ref, in_blob) = var.alloc_for_slice(var_len_obj.as_ref())?;
1409            if in_blob {
1410                // The blob store insertion will never fail.
1411                // SAFETY: `alloc_for_slice` always returns a pointer
1412                // to a `VarLenGranule` in bounds of this page.
1413                // As `in_blob` holds, it is also unused, as required.
1414                // We'll now make that granule valid.
1415                unsafe {
1416                    var.write_large_blob_hash_to_granule(blob_store, var_len_obj, var_len_ref);
1417                }
1418            }
1419            *var_len_ref_slot = var_len_ref;
1420        }
1421
1422        Ok(fixed_len_offset)
1423    }
1424
1425    /// Allocates space for a fixed size row of `fixed_row_size` bytes.
1426    ///
1427    /// # Safety
1428    ///
1429    /// `fixed_row_size` must be equal to the value passed
1430    /// to all other methods ever invoked on `self`.
1431    pub unsafe fn alloc_fixed_len(&mut self, fixed_row_size: Size) -> Result<PageOffset, Error> {
1432        self.alloc_fixed_len_from_freelist(fixed_row_size)
1433            .or_else(|| self.alloc_fixed_len_from_gap(fixed_row_size))
1434            .ok_or(Error::InsufficientFixedLenSpace { need: fixed_row_size })
1435    }
1436
1437    /// Allocates space for a fixed-size row of `fixed_row_size` bytes from the freelist, if possible.
1438    ///
1439    /// This call will clear the unmodified hash.
1440    #[inline]
1441    fn alloc_fixed_len_from_freelist(&mut self, fixed_row_size: Size) -> Option<PageOffset> {
1442        let header = &mut self.header.fixed;
1443        // SAFETY: `header.next_free` points to a `FreeCellRef` whenever `.has()` returns true.
1444        let free = unsafe { header.next_free.take_freelist_head(&self.row_data, |x| x) }?;
1445        header.set_row_present(free, fixed_row_size);
1446
1447        // We have modified the page, so clear the unmodified hash.
1448        self.header.unmodified_hash = None;
1449
1450        Some(free)
1451    }
1452
1453    /// Allocates space for a fixed-size row of `fixed_row_size` bytes from the gap, if possible.
1454    ///
1455    /// This call will clear the unmodified hash.
1456    #[inline]
1457    fn alloc_fixed_len_from_gap(&mut self, fixed_row_size: Size) -> Option<PageOffset> {
1458        if gap_enough_size_for_row(self.header.var.first, self.header.fixed.last, fixed_row_size) {
1459            // We're modifying the page, so clear the unmodified hash.
1460            self.header.unmodified_hash = None;
1461
1462            // Enough space in the gap; move the high water mark and return the old HWM.
1463            // `fixed.last` points *after* the highest-indexed fixed-len row,
1464            // so post-increment.
1465            let ptr = self.header.fixed.last;
1466            self.header.fixed.last += fixed_row_size;
1467            self.header.fixed.set_row_present(ptr, fixed_row_size);
1468            Some(ptr)
1469        } else {
1470            // Not enough space in the gap for another row!
1471            None
1472        }
1473    }
1474
1475    /// Returns an iterator over all the [`PageOffset`]s of the fixed rows in this page
1476    /// beginning with `starting_from`.
1477    ///
1478    /// The rows are assumed to be `fixed_row_size` bytes long
1479    /// and `starting_from` is assumed to be at a valid starting `PageOffset` for a fixed row.
1480    ///
1481    /// NOTE: This method is not `unsafe` as it cannot trigger UB.
1482    /// However, when provided with garbage input, it will return garbage back.
1483    /// It is the caller's responsibility to ensure that `PageOffset`s derived from
1484    /// this iterator are valid when used to do anything `unsafe`.
1485    fn iter_fixed_len_from(&self, fixed_row_size: Size, starting_from: PageOffset) -> FixedLenRowsIter<'_> {
1486        let idx = starting_from / fixed_row_size;
1487        FixedLenRowsIter {
1488            idx_iter: self.header.fixed.present_rows.iter_set_from(idx),
1489            fixed_row_size,
1490        }
1491    }
1492
1493    /// Returns an iterator over all the [`PageOffset`]s of the fixed rows in this page.
1494    ///
1495    /// The rows are assumed to be `fixed_row_size` bytes long.
1496    ///
1497    /// NOTE: This method is not `unsafe` as it cannot trigger UB.
1498    /// However, when provided with garbage input, it will return garbage back.
1499    /// It is the caller's responsibility to ensure that `PageOffset`s derived from
1500    /// this iterator are valid when used to do anything `unsafe`.
1501    pub fn iter_fixed_len(&self, fixed_row_size: Size) -> FixedLenRowsIter<'_> {
1502        FixedLenRowsIter {
1503            idx_iter: self.header.fixed.present_rows.iter_set(),
1504            fixed_row_size,
1505        }
1506    }
1507
1508    /// Returns an iterator over all the `VarLenGranule`s of the var-len object
1509    /// that has its first granule at offset `first_granule`.
1510    /// An empty iterator will be returned when `first_granule` is `NULL`.
1511    ///
1512    /// # Safety
1513    ///
1514    /// `first_granule` must be an offset to a valid granule or `NULL`.
1515    pub unsafe fn iter_var_len_object(
1516        &self,
1517        first_granule: PageOffset,
1518    ) -> impl Clone + Iterator<Item = &VarLenGranule> {
1519        VarLenGranulesIter {
1520            page: self,
1521            next_granule: first_granule,
1522        }
1523    }
1524
1525    /// Returns an iterator over the data of all the `VarLenGranule`s of the var-len object
1526    /// that has its first granule at offset `first_granule`.
1527    /// An empty iterator will be returned when `first_granule` is `NULL`.
1528    ///
1529    /// # Safety
1530    ///
1531    /// `first_granule` must be an offset to a valid granule or `NULL`.
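    ///
    /// # Example
    ///
    /// A sketch of reading back a non-blob var-len object (mirroring the string round-trip
    /// tests in this file; `vlr` is assumed to be a [`VarLenRef`] read out of a row):
    ///
    /// ```ignore
    /// let bytes: Vec<u8> = unsafe { page.iter_vlo_data(vlr.first_granule) }
    ///     .flatten()
    ///     .copied()
    ///     .collect();
    /// assert_eq!(bytes.len(), vlr.length_in_bytes as usize);
    /// ```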
1532    pub unsafe fn iter_vlo_data(&self, first_granule: PageOffset) -> impl '_ + Clone + Iterator<Item = &[u8]> {
1533        // SAFETY: Caller and callee have the exact same safety requirements.
1534        unsafe { self.iter_var_len_object(first_granule) }.map(|g| g.data())
1535    }
1536
1537    /// Free a row, marking its fixed-len and var-len storage granules as available for re-use.
1538    ///
1539    /// This call will clear the unmodified hash.
1540    ///
1541    /// # Safety
1542    ///
1543    /// - `fixed_row` must point to a valid row in this page.
1544    ///
1545    /// - `fixed_row_size` must be the size in bytes of the fixed part
1546    ///   of all past, present, and future rows in this page.
1547    ///
1548    /// - The `var_len_visitor` must visit the same set of `VarLenRef`s in the row
1549    ///   as the visitor provided to `insert_row`.
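    ///
    /// # Example
    ///
    /// A sketch of deleting a previously inserted row (mirroring this file's tests;
    /// `offset` is assumed to come from an earlier `insert_row` with the same visitor and row size):
    ///
    /// ```ignore
    /// unsafe { page.delete_row(offset, Size(8), &visitor, &mut NullBlobStore) };
    /// assert_eq!(page.num_rows(), 0);
    /// ```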
1550    pub unsafe fn delete_row(
1551        &mut self,
1552        fixed_row: PageOffset,
1553        fixed_row_size: Size,
1554        var_len_visitor: &impl VarLenMembers,
1555        blob_store: &mut dyn BlobStore,
1556    ) -> BlobNumBytes {
1557        // We're modifying the page, so clear the unmodified hash.
1558        self.header.unmodified_hash = None;
1559
1560        let (mut fixed, mut var) = self.split_fixed_var_mut();
1561
1562        let mut blob_store_deleted_bytes = BlobNumBytes::default();
1563
1564        // Visit the var-len members of the fixed row and free them.
1565        let row = fixed.get_row(fixed_row, fixed_row_size);
1566        // SAFETY: `row` is derived from `fixed_row`, which is known by caller requirements to be valid.
1567        let var_len_refs = unsafe { var_len_visitor.visit_var_len(row) };
1568        for var_len_ref in var_len_refs {
1569            // SAFETY: A sound call to `visit_var_len` on a fully initialized valid row,
1570            // which we've justified that the above is,
1571            // returns an iterator, that will only yield `var_len_ref`s,
1572            // where `var_len_ref.first_granule` points to a valid `VarLenGranule` or is NULL.
1573            blob_store_deleted_bytes += unsafe { var.free_object(*var_len_ref, blob_store) }
1574        }
1575
1576        // SAFETY: Caller promised that `fixed_row` points to a valid row in the page.
1577        // Thus, `range_move(0..fixed_row_size, fixed_row)` is in bounds of `row_data`.
1578        // Moreover, this entails that it is valid for writing a `FreeCellRef`
1579        // to the beginning of that range, as any row can at least hold a `FreeCellRef`
1580        // and will be properly aligned for it as well.
1581        unsafe {
1582            fixed.free(fixed_row, fixed_row_size);
1583        }
1584
1585        blob_store_deleted_bytes
1586    }
1587
1588    /// Returns the total number of granules used by the fixed row at `fixed_row_offset`
1589    /// and lasting `fixed_row_size` bytes, where `var_len_visitor` is used to find
1590    /// the [`VarLenRef`]s in the fixed row.
1591    ///
1592    /// # Safety
1593    ///
1594    /// - `fixed_row_offset` must refer to a previously-allocated and initialized row in `self`,
1595    ///   and must not have been de-allocated. In other words, the fixed row must be *valid*.
1596    ///
1597    /// - `fixed_row_size` and `var_len_visitor` must be consistent with each other
1598    ///   and with all other calls to any methods on `self`.
1599    pub unsafe fn row_total_granules(
1600        &self,
1601        fixed_row_offset: PageOffset,
1602        fixed_row_size: Size,
1603        var_len_visitor: &impl VarLenMembers,
1604    ) -> usize {
1605        let fixed_row = self.get_row_data(fixed_row_offset, fixed_row_size);
1606        // SAFETY:
1607        // - Caller promised that `fixed_row_offset` is a valid row.
1608        // - Caller promised consistency of `var_len_visitor` wrt. `fixed_row_size` and this page.
1609        let vlr_iter = unsafe { var_len_visitor.visit_var_len(fixed_row) };
1610        vlr_iter.copied().map(|slot| slot.granules_used()).sum()
1611    }
1612
1613    /// Copy as many rows from `self` for which `filter` returns `true` into `dst` as will fit,
1614    /// starting from `starting_from`.
1615    ///
1616    /// If less than the entirety of `self` could be processed, return `Continue(resume_point)`,
1617    /// where `resume_point` is the `starting_from` argument of a subsequent call to `copy_filter_into`
1618    /// that will complete the iteration.
1619    /// `dst` should be assumed to be full in this case,
1620    /// as it does not contain enough free space to store the row of `self` at `resume_point`.
1621    ///
1622    /// If the entirety of `self` is processed, return `Break`.
1623    /// `dst` may or may not be full in this case, but is likely not full.
1624    ///
1625    /// # Safety
1626    ///
1627    /// The `var_len_visitor` must visit the same set of `VarLenRef`s in the row
1628    /// as the visitor provided to all other methods on `self` and `dst`.
1629    ///
1630    /// The `fixed_row_size` must be consistent with the `var_len_visitor`,
1631    /// and be equal to the value provided to all other methods on `self` and `dst`.
1632    ///
1633    /// The `starting_from` offset must point to a valid starting offset
1634    /// consistent with `fixed_row_size`.
1635    /// That is, it must not point into the middle of a row.
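    ///
    /// # Example
    ///
    /// A sketch of the resumable copy loop this return value is designed for
    /// (`keep` and `fresh_page` are hypothetical helpers, not part of this module):
    ///
    /// ```ignore
    /// let mut start = PageOffset(0);
    /// let mut dst = fresh_page();
    /// loop {
    ///     match unsafe {
    ///         src.copy_filter_into(start, &mut dst, fixed_row_size, &visitor, blob_store, keep)
    ///     } {
    ///         // `dst` is full: remember the uncopied row, grab a new page, and resume.
    ///         ControlFlow::Continue(resume_point) => {
    ///             start = resume_point;
    ///             dst = fresh_page();
    ///         }
    ///         // All of `src` has been processed.
    ///         ControlFlow::Break(()) => break,
    ///     }
    /// }
    /// ```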
1636    pub unsafe fn copy_filter_into(
1637        &self,
1638        starting_from: PageOffset,
1639        dst: &mut Page,
1640        fixed_row_size: Size,
1641        var_len_visitor: &impl VarLenMembers,
1642        blob_store: &mut dyn BlobStore,
1643        mut filter: impl FnMut(&Page, PageOffset) -> bool,
1644    ) -> ControlFlow<(), PageOffset> {
1645        for row_offset in self
1646            .iter_fixed_len_from(fixed_row_size, starting_from)
1647            // Only copy rows satisfying the predicate `filter`.
1648            .filter(|o| filter(self, *o))
1649        {
1650            // SAFETY:
1651            // - `starting_from` points to a valid row and thus `row_offset` also does.
1652            // - `var_len_visitor` will visit the right `VarLenRef`s and is consistent with other calls.
1653            // - `fixed_row_size` is consistent with `var_len_visitor` and `self`.
1654            if !unsafe { self.copy_row_into(row_offset, dst, fixed_row_size, var_len_visitor, blob_store) } {
1655                // Target doesn't have enough space for row;
1656                // stop here and return the offset of the uncopied row
1657                // so a later call to `copy_filter_into` can start there.
1658                return ControlFlow::Continue(row_offset);
1659            }
1660        }
1661
1662        // The `for` loop completed.
1663        // We successfully copied the entire page of `self` into `dst`.
1664        // The caller doesn't need to resume from this offset.
1665        ControlFlow::Break(())
1666    }
1667
1668    /// Copies the row at `row_offset` from `self` into `dst`
1669    /// and returns `true` on success, or `false` if `dst` has no space for the row.
1670    ///
1671    /// # Safety
1672    ///
1673    /// - `row_offset` must point to a valid row.
1674    ///
1675    /// - `var_len_visitor` must visit the same set of `VarLenRef`s in the row
1676    ///   as the visitor provided to all other methods on `self` and `dst`.
1677    ///
1678    /// - `fixed_row_size` must be consistent with the `var_len_visitor`,
1679    ///   and be equal to the value provided to all other methods on `self` and `dst`.
1680    unsafe fn copy_row_into(
1681        &self,
1682        row_offset: PageOffset,
1683        dst: &mut Page,
1684        fixed_row_size: Size,
1685        var_len_visitor: &impl VarLenMembers,
1686        blob_store: &mut dyn BlobStore,
1687    ) -> bool {
1688        // SAFETY: Caller promised that `row_offset` points to a valid row
1689        // consistent with `fixed_row_size` which was also
1690        // claimed to be consistent with `var_len_visitor` and `self`.
1691        let required_granules = unsafe { self.row_total_granules(row_offset, fixed_row_size, var_len_visitor) };
1692        if !dst.has_space_for_row(fixed_row_size, required_granules) {
1693            // Target doesn't have enough space for row.
1694            return false;
1695        };
1696
1697        let src_row = self.get_row_data(row_offset, fixed_row_size);
1698
1699        // Allocate for the fixed-len data.
1700        // SAFETY: forward our requirement on `fixed_row_size` to `alloc_fixed_len`.
1701        let inserted_offset = unsafe { dst.alloc_fixed_len(fixed_row_size) }
1702            .expect("Failed to allocate fixed-len row in dst page after checking for available space");
1703
1704        // Copy all fixed-len data. We'll overwrite the var-len parts next.
1705        let (mut dst_fixed, mut dst_var) = dst.split_fixed_var_mut();
1706        let dst_row = dst_fixed.get_row_mut(inserted_offset, fixed_row_size);
1707        dst_row.copy_from_slice(src_row);
1708
1709        // Copy var-len members into target.
1710        // Fixup `VarLenRef`s in `dst_row` to point to the copied var-len objects.
1711        //
1712        // SAFETY: `src_row` is valid because it came from `self.iter_fixed_len_from`.
1713        //
1714        //         Forward our safety requirements re: `var_len_visitor` to `visit_var_len`.
1715        let src_vlr_iter = unsafe { var_len_visitor.visit_var_len(src_row) };
1716        // SAFETY: forward our requirement on `var_len_visitor` to `visit_var_len_mut`.
1717        let target_vlr_iter = unsafe { var_len_visitor.visit_var_len_mut(dst_row) };
1718        for (src_vlr, target_vlr_slot) in src_vlr_iter.zip(target_vlr_iter) {
1719            // SAFETY:
1720            //
1721            // - requirements of `visit_var_len` were met,
1722            //   so we can assume that `src_vlr.first_granule` points to a valid granule or is NULL.
1723            //
1724            // - the call to `dst.has_space_for_row` above ensures
1725            //   that the allocation will not fail part-way through.
1726            let target_vlr_fixup = unsafe { self.copy_var_len_into(*src_vlr, &mut dst_var, blob_store) }
1727                .expect("Failed to allocate var-len object in dst page after checking for available space");
1728
1729            *target_vlr_slot = target_vlr_fixup;
1730        }
1731
1732        true
1733    }
1734
1735    /// Copy a var-len object `src_vlr` from `self` into `dst_var`,
1736    /// and return the `VarLenRef` to the copy in `dst_var`.
1737    ///
1738    /// If the `src_vlr` is empty,
1739    /// i.e., has `first_granule.is_null()` and `length_in_bytes == 0`,
1740    /// this will return `VarLenRef::NULL`.
1741    ///
1742    /// # Safety
1743    ///
1744    /// - `src_vlr.first_granule` must point to a valid granule or be NULL.
1745    ///
1746    /// - To avoid leaving dangling uninitialized allocations in `dst_var`,
1747    ///   `dst_var` must already be checked to have enough size to store `src_vlr`
1748    ///   using `Self::has_space_for_row`.
1749    unsafe fn copy_var_len_into(
1750        &self,
1751        src_vlr: VarLenRef,
1752        dst_var: &mut VarView<'_>,
1753        blob_store: &mut dyn BlobStore,
1754    ) -> Result<VarLenRef, Error> {
1755        // SAFETY: Caller promised that `src_vlr.first_granule` points to a valid granule or is NULL.
1756        let mut iter = unsafe { self.iter_var_len_object(src_vlr.first_granule) };
1757
1758        // If the `src_vlr` is empty, don't copy anything, and return null.
1759        let Some(mut src_chunk) = iter.next() else {
1760            debug_assert!(src_vlr.length_in_bytes == 0);
1761            return Ok(VarLenRef::NULL);
1762        };
1763        let mut dst_chunk = dst_var.alloc_granule()?;
1764
1765        let copied_head = dst_chunk;
1766
1767        // Weird-looking loop so that we can write each granule's next-pointer into the previously allocated `dst_chunk`.
1768        for next_src_chunk in iter {
1769            // Allocate space for the next granule so we can initialize it in the next iteration.
1770            let next_dst_chunk = dst_var.alloc_granule()?;
1771            let data = src_chunk.data();
1772            // Initialize `dst_chunk` with data and next-pointer.
1773            //
1774            // SAFETY:
1775            // 1. `dst_chunk` is properly aligned as it came from `alloc_granule` either
1776            //    before the loop or in the previous iteration.
1777            //    This also ensures that both are in bounds
1778            //    of the page for `granule .. granule + VarLenGranule::SIZE`.
1779            //
1780            // 2. `next_dst_chunk` will be initialized
1781            //    either in the next iteration or after the loop ends.
1782            //
1783            // 3. `dst_chunk` points to unused data as the space was allocated before the loop
1784            //    or was `next_dst_chunk` in the previous iteration and hasn't been written to yet.
1785            unsafe { dst_var.write_chunk_to_granule(data, data.len(), dst_chunk, next_dst_chunk) };
1786            dst_chunk = next_dst_chunk;
1787            src_chunk = next_src_chunk;
1788        }
1789
1790        let data = src_chunk.data();
1791        // The last granule has null as next-pointer.
1792        //
1793        // SAFETY:
1794        // 1. `dst_chunk` is properly aligned as it came from `alloc_granule` either
1795        //    before the loop or in the previous iteration.
1796        //    This also ensures that both are in bounds
1797            //    of the page for `granule .. granule + VarLenGranule::SIZE`.
1798        //
1799        // 2. `next` is NULL which is trivially init.
1800        //
1801        // 3. `dst_chunk` points to unused data as the space was allocated before the loop
1802        //    or was `next_dst_chunk` in the previous iteration and hasn't been written to yet.
1803        unsafe { dst_var.write_chunk_to_granule(data, data.len(), dst_chunk, PageOffset::VAR_LEN_NULL) };
1804
1805        // For a large blob object,
1806        // notify the `blob_store` that we've taken a reference to the blob hash.
1807        if src_vlr.is_large_blob() {
1808            blob_store
1809                .clone_blob(&src_chunk.blob_hash())
1810                .expect("blob_store could not mark hash as used");
1811        }
1812
1813        Ok(VarLenRef {
1814            first_granule: copied_head,
1815            length_in_bytes: src_vlr.length_in_bytes,
1816        })
1817    }
1818
1819    /// Make `self` empty, removing all rows from it and resetting the high water marks to zero.
1820    ///
1821    /// This also clears the `unmodified_hash`.
1822    pub fn clear(&mut self) {
1823        self.header.clear();
1824    }
1825
1826    /// Zeroes every byte of row data in this page.
1827    ///
1828    /// # Safety
1829    ///
1830    /// Causes the page header to no longer match the contents, invalidating many assumptions.
1831    /// Should be called in conjunction with [`Self::clear`].
1832    pub unsafe fn zero_data(&mut self) {
1833        self.row_data.fill(0);
1834    }
1835
1836    /// Resets this page for reuse of its allocation.
1837    ///
1838    /// The reset page supports at most `max_rows_in_page` rows.
1839    pub fn reset_for(&mut self, max_rows_in_page: usize) {
1840        self.header.reset_for(max_rows_in_page);
1841
1842        // NOTE(centril): We previously zeroed pages when resetting.
1843        // This had an adverse performance impact.
1844        // The reason why we previously zeroed was for security under a multi-tenant setup
1845        // when exposing a module ABI that allows modules to memcpy whole pages over.
1846        // However, we have no such ABI for the time being, so we can soundly avoid zeroing.
1847        // If we ever decide to add such an ABI, we must start zeroing again.
1848        //
1849        // // SAFETY: We just reset the page header.
1850        // unsafe { self.zero_data() };
1851    }
1852
1853    /// Sets the header and the row data.
1854    ///
1855    /// # Safety
1856    ///
1857    /// The `header` and `row_data` must be consistent with each other.
1858    pub(super) unsafe fn set_raw(&mut self, header: PageHeader, row_data: RowData) {
1859        self.header = header;
1860        self.row_data = row_data;
1861    }
1862
1863    /// Returns the page header, for testing.
1864    #[cfg(test)]
1865    pub(super) fn page_header_for_test(&self) -> &PageHeader {
1866        &self.header
1867    }
1868
1869    /// Computes the content hash of this page.
1870    pub fn content_hash(&self) -> blake3::Hash {
1871        let mut hasher = blake3::Hasher::new();
1872
1873        // Hash the page contents.
1874        hasher.update(&self.row_data);
1875
1876        // Hash the `FixedHeader`. First, copy out the fixed part, save for the bitset, into an array.
1877        let fixed = &self.header.fixed;
1878        let mut fixed_bytes = [0u8; 6];
1879        fixed_bytes[0..2].copy_from_slice(&fixed.next_free.next.0.to_le_bytes());
1880        fixed_bytes[2..4].copy_from_slice(&fixed.last.0.to_le_bytes());
1881        fixed_bytes[4..6].copy_from_slice(&fixed.num_rows.to_le_bytes());
1882        hasher.update(&fixed_bytes);
1883
1884        // Hash the fixed bit set.
1885        hasher.update(bytemuck::must_cast_slice(fixed.present_rows.storage()));
1886
1887        // Hash the `VarHeader`.
1888        hasher.update(bytemuck::bytes_of(&self.header.var));
1889
1890        // We're done.
1891        // Note that `unmodified_hash` itself must not be hashed to avoid a recursive dependency.
1892        hasher.finalize()
1893    }
1894
1895    /// Computes the content hash of this page and saves it to [`PageHeader::unmodified_hash`].
1896    pub fn save_content_hash(&mut self) {
1897        let hash = self.content_hash();
1898        self.header.unmodified_hash = Some(hash);
1899    }
1900
1901    /// Return the page's content hash, computing and saving it if it is not already stored.
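    ///
    /// # Example
    ///
    /// A sketch of the caching behavior (mirroring the `hash_unmodified_save_get` helper
    /// in this file's tests):
    ///
    /// ```ignore
    /// let h1 = page.save_or_get_content_hash(); // computes and caches the hash
    /// let h2 = page.save_or_get_content_hash(); // returns the cached hash
    /// assert_eq!(h1, h2);
    /// // Mutating calls (e.g. `insert_row`, `delete_row`, `get_fixed_row_data_mut`)
    /// // clear the cache, so the next call recomputes it.
    /// ```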
1902    pub fn save_or_get_content_hash(&mut self) -> blake3::Hash {
1903        self.unmodified_hash().copied().unwrap_or_else(|| {
1904            self.save_content_hash();
1905            self.header.unmodified_hash.unwrap()
1906        })
1907    }
1908
1909    /// Returns the stored unmodified hash, if any.
1910    pub fn unmodified_hash(&self) -> Option<&blake3::Hash> {
1911        self.header.unmodified_hash.as_ref()
1912    }
1913}
1914
1915/// An iterator over the `PageOffset`s of all present fixed-length rows in a [`Page`].
1916pub struct FixedLenRowsIter<'page> {
1917    /// Iterator over the set bits of the page's `present_rows` bitset,
1918    /// used to find the indices of the fixed-row slots
1919    /// that actually hold present rows.
1920    idx_iter: IterSet<'page>,
1921    /// The size of a row in bytes.
1922    fixed_row_size: Size,
1923}
1924
1925impl Iterator for FixedLenRowsIter<'_> {
1926    type Item = PageOffset;
1927
1928    fn next(&mut self) -> Option<Self::Item> {
1929        self.idx_iter
1930            .next()
1931            .map(|idx| PageOffset(idx as u16 * self.fixed_row_size.0))
1932    }
1933}
1934
1935/// An iterator over the [`VarLenGranule`]s in a particular [`VarLenRef`] in `page`.
1936///
1937/// Constructing a `VarLenGranulesIter` should be considered unsafe
1938/// because the initial `next_granule` must either be `NULL` or point to a valid [`VarLenGranule`].
1939///
1940/// Iterating over [`VarLenRef::NULL`] is safe and will immediately return `None`.
1941#[derive(Clone, Copy)]
1942struct VarLenGranulesIter<'page> {
1943    /// The page to yield granules from.
1944    page: &'page Page,
1945    /// Location of the next granule in `page`.
1946    /// Must either be `NULL` or point to a valid granule.
1947    next_granule: PageOffset,
1948    // TODO(perf,bikeshedding): store length and implement `Iterator::size_hint`?
1949}
1950
1951impl<'page> Iterator for VarLenGranulesIter<'page> {
1952    type Item = &'page VarLenGranule;
1953
1954    fn next(&mut self) -> Option<Self::Item> {
1955        if self.next_granule.is_var_len_null() {
1956            return None;
1957        }
1958
1959        // SAFETY: By construction,
1960        // the initial `next_granule` was promised to either be `NULL` or point to a valid granule.
1961        // For a given granule, the same applies to its `.next()` granule.
1962        // At this point, we've excluded `NULL`,
1963        // so we know inductively that `next_granule` points to a valid granule, as required.
1964        let granule: &VarLenGranule = unsafe { get_ref(&self.page.row_data, self.next_granule) };
1965        self.next_granule = granule.header.next();
1966
1967        Some(granule)
1968    }
1969}
1970
1971#[cfg(test)]
1972pub(crate) mod tests {
1973    use super::*;
1974    use crate::{blob_store::NullBlobStore, page_pool::PagePool, var_len::AlignedVarLenOffsets};
1975    use proptest::{collection::vec, prelude::*};
1976    use spacetimedb_sats::bsatn;
1977    use spacetimedb_sats::layout::row_size_for_type;
1978
1979    fn u64_row_size() -> Size {
1980        let fixed_row_size = row_size_for_type::<u64>();
1981        assert_eq!(fixed_row_size.len(), 8);
1982        fixed_row_size
1983    }
1984
1985    const U64_VL_VISITOR: AlignedVarLenOffsets<'_> = AlignedVarLenOffsets::from_offsets(&[]);
1986    fn u64_var_len_visitor() -> &'static AlignedVarLenOffsets<'static> {
1987        &U64_VL_VISITOR
1988    }
1989
1990    fn insert_u64(page: &mut Page, val: u64) -> PageOffset {
1991        let val_slice = val.to_le_bytes();
1992        unsafe { page.insert_row(&val_slice, &[] as &[&[u8]], u64_var_len_visitor(), &mut NullBlobStore) }
1993            .expect("Failed to insert first row")
1994    }
1995
1996    fn insert_u64_drop(page: &mut Page, val: u64) {
1997        insert_u64(page, val);
1998    }
1999
2000    fn read_u64(page: &Page, offset: PageOffset) -> u64 {
2001        let row = page.get_row_data(offset, u64_row_size());
2002        u64::from_le_bytes(row.try_into().unwrap())
2003    }
2004
2005    fn data_sub_n_vlg(n: usize) -> usize {
2006        PageOffset::PAGE_END.idx() - (VarLenGranule::SIZE * n).len()
2007    }
2008
2009    pub(crate) fn hash_unmodified_save_get(page: &mut Page) -> blake3::Hash {
2010        assert_eq!(page.header.unmodified_hash, None);
2011        page.save_content_hash();
2012        page.header.unmodified_hash.unwrap()
2013    }
2014
2015    #[test]
2016    fn insert_one_u64() {
2017        let mut page = Page::new(u64_row_size());
2018
2019        // First the hash is not saved, so compute it.
2020        let hash = hash_unmodified_save_get(&mut page);
2021
2022        let val: u64 = 0xa5a5_a5a5_a5a5_a5a5;
2023
2024        let offset = insert_u64(&mut page, val);
2025
2026        assert_eq!(offset.idx(), 0);
2027
2028        let row_val = read_u64(&page, offset);
2029
2030        assert_eq!(row_val, val);
2031
2032        // The hash should have been cleared.
2033        assert_ne!(hash, hash_unmodified_save_get(&mut page));
2034    }
2035
2036    fn insert_while(
2037        page: &mut Page,
2038        mut next_val: u64,
2039        fixed_row_size: Size,
2040        vl_num: usize,
2041        mut insert: impl FnMut(&mut Page, u64),
2042    ) -> u64 {
2043        while page.has_space_for_row(fixed_row_size, vl_num) {
2044            insert(page, next_val);
2045            next_val += 1;
2046        }
2047        next_val
2048    }
2049
2050    #[test]
2051    fn fill_then_iter_fixed_len_u64() {
2052        let mut page = Page::new(u64_row_size());
2053
2054        let last_val = insert_while(&mut page, 0, u64_row_size(), 0, insert_u64_drop);
2055        assert_eq!(last_val, (PageOffset::PAGE_END / u64_row_size()) as u64);
2056
2057        for (row_idx, expected_val) in page.iter_fixed_len(u64_row_size()).zip(0..last_val) {
2058            let row_val = read_u64(&page, row_idx);
2059            assert_eq!(
2060                row_val, expected_val,
2061                "row_val {row_val:x} /= expected_val {expected_val:x}"
2062            );
2063        }
2064    }
2065
2066    #[test]
2067    fn fill_delete_iter_fixed_len_u64() {
2068        let mut page = Page::new(u64_row_size());
2069
2070        // First the hash is not saved, so compute it.
2071        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2072
2073        // Insert rows.
2074        let mut odds: Vec<PageOffset> = Vec::new();
2075        let last_val = insert_while(&mut page, 2, u64_row_size(), 0, |page, val| {
2076            let offset = insert_u64(page, val);
2077            if val % 2 == 1 {
2078                odds.push(offset);
2079            }
2080        });
2081
2082        // The hash should have been cleared.
2083        let hash_pre_del = hash_unmodified_save_get(&mut page);
2084        assert_ne!(hash_pre_ins, hash_pre_del);
2085
2086        // Delete rows.
2087        for row_offset in odds {
2088            unsafe { page.delete_row(row_offset, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
2089        }
2090
2091        // The hash should have been cleared.
2092        let hash_pre_iter = hash_unmodified_save_get(&mut page);
2093        assert_ne!(hash_pre_ins, hash_pre_iter);
2094        assert_ne!(hash_pre_del, hash_pre_iter);
2095
2096        // Iterate the rows.
2097        for (row_offset, expected_val) in page.iter_fixed_len(u64_row_size()).zip((2..last_val).step_by(2)) {
2098            let found_val = read_u64(&page, row_offset);
2099            assert_eq!(found_val, expected_val);
2100        }
2101
2102        // Hash is unchanged.
2103        assert_eq!(page.header.unmodified_hash, Some(hash_pre_iter));
2104    }
2105
2106    #[test]
2107    /// After deleting a fixed-length row and then inserting a new fixed-length row,
2108    /// the fixed-length high water mark must not change,
2109    /// i.e. we must re-use memory from the deleted row to store the new insertion.
2110    fn reuse_fixed_len_space() {
2111        let mut page = Page::new(u64_row_size());
2112
2113        // First the hash is not saved, so compute it.
2114        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2115
2116        // Insert two rows.
2117        let offset_0 = insert_u64(&mut page, 0xa5a5a5a5_a5a5a5a5);
2118        assert_eq!(offset_0.idx(), 0);
2119        let offset_1 = insert_u64(&mut page, 0xbeefbeef_beefbeef);
2120        assert_eq!(offset_1, u64_row_size());
2121
2122        assert_eq!(page.header.fixed.last, u64_row_size() * 2);
2123
2124        // Hash has been cleared after inserts.
2125        let hash_pre_del = hash_unmodified_save_get(&mut page);
2126        assert_ne!(hash_pre_ins, hash_pre_del);
2127
2128        // Delete first row.
2129        unsafe { page.delete_row(offset_0, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
2130
2131        assert_eq!(page.header.fixed.last, u64_row_size() * 2);
2132
2133        // Hash has been cleared after deletes.
2134        let hash_pre_ins2 = hash_unmodified_save_get(&mut page);
2135        assert_ne!(hash_pre_ins, hash_pre_ins2);
2136        assert_ne!(hash_pre_del, hash_pre_ins2);
2137
2138        // Insert first row again, re-using memory.
2139        let offset_0_again = insert_u64(&mut page, 0xffffffff_ffffffff);
2140
2141        assert_eq!(offset_0_again.idx(), 0);
2142        assert_eq!(offset_0.idx(), offset_0_again.idx());
2143
2144        assert_eq!(page.header.fixed.last, u64_row_size() * 2);
2145
2146        // Hash has been cleared after last insert, despite re-using memory.
2147        let hash_post_ins2 = hash_unmodified_save_get(&mut page);
2148        assert_ne!(hash_pre_ins, hash_post_ins2);
2149        assert_ne!(hash_pre_del, hash_post_ins2);
2150        assert_ne!(hash_pre_ins2, hash_post_ins2);
2151    }
2152
2153    const STR_ROW_SIZE: Size = row_size_for_type::<VarLenRef>();
2154
2155    const _: () = assert!(STR_ROW_SIZE.len() == mem::size_of::<VarLenRef>());
2156
2157    const STR_VL_VISITOR: AlignedVarLenOffsets<'_> = AlignedVarLenOffsets::from_offsets(&[0]);
2158    fn str_var_len_visitor() -> &'static AlignedVarLenOffsets<'static> {
2159        &STR_VL_VISITOR
2160    }
2161
2162    fn insert_str(page: &mut Page, data: &[u8]) -> PageOffset {
2163        let fixed_len_data = [0u8; STR_ROW_SIZE.len()];
2164        unsafe { page.insert_row(&fixed_len_data, &[data], str_var_len_visitor(), &mut NullBlobStore) }
2165            .expect("Failed to insert row")
2166    }
2167
2168    fn read_str_ref(page: &Page, offset: PageOffset) -> VarLenRef {
2169        *unsafe { get_ref(&page.row_data, offset) }
2170    }
2171
2172    #[test]
2173    fn insert_empty_str() {
2174        let mut page = Page::new(STR_ROW_SIZE);
2175
2176        // First the hash is not saved, so compute it.
2177        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2178
2179        // Insert the empty string.
2180        let offset = insert_str(&mut page, &[]);
2181
2182        // No granules were used.
2183        let extracted = read_str_ref(&page, offset);
2184        let mut granules_iter = unsafe { page.iter_var_len_object(extracted.first_granule) };
2185        assert!(granules_iter.next().is_none());
2186        drop(granules_iter);
2187
2188        // Hash is cleared even though the string was empty.
2189        assert_ne!(hash_pre_ins, hash_unmodified_save_get(&mut page));
2190    }
2191
2192    proptest! {
2193        #[test]
2194        fn insert_one_short_str(data in vec(any::<u8>(), 1..VarLenGranule::DATA_SIZE)) {
2195            let mut page = Page::new(STR_ROW_SIZE);
2196
2197            // First the hash is not saved, so compute it.
2198            let hash_pre_ins = hash_unmodified_save_get(&mut page);
2199
2200            // Insert the row.
2201            let offset = insert_str(&mut page, &data);
2202
2203            // Hash was cleared by the insert.
2204            let hash_pre_iter = hash_unmodified_save_get(&mut page);
2205            assert_ne!(hash_pre_ins, hash_pre_iter);
2206
2207            // Confirm we inserted correctly.
2208            let extracted = read_str_ref(&page, offset);
2209            let mut data_iter = unsafe { page.iter_vlo_data(extracted.first_granule) };
2210            let (first, next) = (data_iter.next(), data_iter.next());
2211            assert_eq!(first, Some(&*data));
2212            assert_eq!(next, None);
2213
2214            // Iteration and reading did not change the hash.
2215            assert_eq!(hash_pre_iter, page.header.unmodified_hash.unwrap());
2216        }
2217
2218        #[test]
2219        fn insert_one_long_str(data in vec(any::<u8>(), (VarLenGranule::OBJECT_SIZE_BLOB_THRESHOLD / 2)..VarLenGranule::OBJECT_SIZE_BLOB_THRESHOLD)) {
2220            let mut page = Page::new(STR_ROW_SIZE);
2221
2222            // First the hash is not saved, so compute it.
2223            let hash_pre_ins = hash_unmodified_save_get(&mut page);
2224
2225            // Insert the long string.
2226            let offset = insert_str(&mut page, &data);
2227
2228            // The hash was cleared, and the new one is different.
2229            let hash_post_ins = hash_unmodified_save_get(&mut page);
2230            assert_ne!(hash_pre_ins, hash_post_ins);
2231
2232            // Check that we inserted correctly.
2233            let extracted = read_str_ref(&page, offset);
2234
2235            let mut data_iter = unsafe { page.iter_vlo_data(extracted.first_granule) };
2236            let mut chunks_iter = data.chunks(VarLenGranule::DATA_SIZE);
2237
2238            for (i, (data, chunk)) in (&mut data_iter).zip(&mut chunks_iter).enumerate() {
2239                assert_eq!(
2240                    data,
2241                    chunk,
2242                    "Chunk {i} does not match. Left is found, right is expected.",
2243                );
2244            }
2245
2246            // Both iterators must be finished, i.e. they must have the same length.
2247            assert!(data_iter.next().is_none());
2248            assert!(chunks_iter.next().is_none());
2249
2250            // Reading did not alter the hash.
2251            assert_eq!(hash_post_ins, page.header.unmodified_hash.unwrap());
2252        }
2253    }
2254
2255    #[test]
2256    fn reuse_var_len_space_no_fragmentation_concerns() {
2257        let data_0 = b"Hello, world!";
2258        let data_1 = b"How goes life?";
2259        let data_2 = b"Glad to hear it.";
2260
2261        let mut page = Page::new(STR_ROW_SIZE);
2262        let offset_0 = insert_str(&mut page, data_0);
2263        let offset_1 = insert_str(&mut page, data_1);
2264
2265        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(2));
2266
2267        assert_ne!(offset_0.idx(), offset_1.idx());
2268
2269        let var_len_0 = read_str_ref(&page, offset_0);
2270
2271        assert_eq!(var_len_0.length_in_bytes as usize, data_0.len());
2272        assert_eq!(var_len_0.first_granule.idx(), data_sub_n_vlg(1));
2273
2274        let var_len_1 = read_str_ref(&page, offset_1);
2275
2276        assert_eq!(var_len_1.length_in_bytes as usize, data_1.len());
2277        assert_eq!(var_len_1.first_granule.idx(), data_sub_n_vlg(2));
2278
2279        let hash_pre_del = hash_unmodified_save_get(&mut page);
2280
2281        unsafe { page.delete_row(offset_0, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2282
2283        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2284
2285        let offset_2 = insert_str(&mut page, data_2);
2286
2287        let hash_post_ins = hash_unmodified_save_get(&mut page);
2288        assert_ne!(hash_pre_del, hash_pre_ins);
2289        assert_ne!(hash_pre_del, hash_post_ins);
2290        assert_ne!(hash_pre_ins, hash_post_ins);
2291
2292        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(2));
2293
2294        assert_eq!(offset_0.idx(), offset_2.idx());
2295
2296        let var_len_2 = read_str_ref(&page, offset_2);
2297
2298        assert_eq!(var_len_2.length_in_bytes as usize, data_2.len());
2299        assert_eq!(var_len_2.first_granule.idx(), var_len_0.first_granule.idx());
2300    }
2301
2302    #[test]
2303    fn free_var_len_obj_multiple_granules() {
2304        let mut page = Page::new(STR_ROW_SIZE);
2305
2306        // Allocate a 4-granule var-len object.
2307        let data_0 = [0xa5u8].repeat(VarLenGranule::DATA_SIZE * 4);
2308        let offset_0 = insert_str(&mut page, &data_0);
2309
2310        let var_len_0 = read_str_ref(&page, offset_0);
2311
2312        // Read the addresses of its var-len granules.
2313        let granules_0 = unsafe { page.iter_var_len_object(var_len_0.first_granule) }
2314            .map(|granule| granule as *const VarLenGranule as usize)
2315            .collect::<Vec<_>>();
2316
2317        // Sanity checks: we have allocated 4 granules.
2318        assert_eq!(granules_0.len(), 4);
2319        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(4));
2320
2321        // Delete the row.
2322        unsafe { page.delete_row(offset_0, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2323
2324        // Allocate a new 4-granule var-len object.
2325        // This should use the same storage as the original row.
2326        let data_1 = [0xffu8].repeat(VarLenGranule::DATA_SIZE * 4);
2327        let offset_1 = insert_str(&mut page, &data_1);
2328
2329        let var_len_1 = read_str_ref(&page, offset_1);
2330
2331        // Read the addresses of the new allocation's var-len granules.
2332        let granules_1 = unsafe { page.iter_var_len_object(var_len_1.first_granule) }
2333            .map(|granule| granule as *const VarLenGranule as usize)
2334            .collect::<Vec<_>>();
2335
2336        // Sanity check: the new allocation is also 4 granules.
2337        assert_eq!(granules_1.len(), 4);
2338
2339        for granule in granules_1.iter().copied() {
2340            // The new var-len allocation must contain all the same granules by address
2341            // as the old var-len allocation.
2342            assert!(granules_0.iter().copied().any(|other_granule| other_granule == granule));
2343        }
2344
2345        // The var-len high water mark must not have moved.
2346        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(4));
2347    }
2348
2349    #[test]
2350    fn reuse_var_len_space_avoid_fragmentation() {
2351        let data_0 = &[0xa5u8];
2352        let data_1 = &[0xffu8];
2353        let data_2 = [0x11u8].repeat(VarLenGranule::DATA_SIZE + 1);
2354        let data_2 = data_2.as_ref();
2355
2356        let mut page = Page::new(STR_ROW_SIZE);
2357
2358        // First the hash is not saved, so compute it.
2359        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2360
2361        // Insert two string rows.
2362        let offset_0 = insert_str(&mut page, data_0);
2363        let _offset_1 = insert_str(&mut page, data_1);
2364
2365        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(2));
2366
2367        // Hash is cleared by inserting and the new one is different.
2368        let hash_pre_del = hash_unmodified_save_get(&mut page);
2369        assert_ne!(hash_pre_ins, hash_pre_del);
2370
2371        // Delete the first row.
2372        unsafe { page.delete_row(offset_0, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2373
2374        // Hash is cleared by deleting.
2375        let hash_post_del = hash_unmodified_save_get(&mut page);
2376        assert_ne!(hash_pre_ins, hash_post_del);
2377        assert_ne!(hash_pre_del, hash_post_del);
2378
2379        // Insert again, re-using memory.
2380        let offset_2 = insert_str(&mut page, data_2);
2381
2382        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(3));
2383
2384        // Hash is cleared by inserting again, even though we re-used memory.
2385        let hash_post_ins2 = hash_unmodified_save_get(&mut page);
2386        assert_ne!(hash_pre_ins, hash_post_ins2);
2387        assert_ne!(hash_pre_del, hash_post_ins2);
2388        assert_ne!(hash_post_del, hash_post_ins2);
2389
2390        // Check that we inserted correctly.
2391        let var_len_2 = read_str_ref(&page, offset_2);
2392
2393        let mut data_iter = unsafe { page.iter_vlo_data(var_len_2.first_granule) };
2394        let mut chunks_iter = data_2.chunks(VarLenGranule::DATA_SIZE);
2395
2396        for (i, (data, chunk)) in (&mut data_iter).zip(&mut chunks_iter).enumerate() {
2397            assert_eq!(
2398                data, chunk,
2399                "Chunk {i} does not match. Left is found, right is expected.",
2400            );
2401        }
2402
2403        // Both iterators must be finished, i.e. they must have the same length.
2404        assert!(data_iter.next().is_none());
2405        assert!(chunks_iter.next().is_none());
2406    }
2407
2408    fn check_u64_in_str(page: &Page, row_idx: PageOffset, expected_val: u64) {
2409        let vlr = read_str_ref(page, row_idx);
2410
2411        let mut var_len_iter = unsafe { page.iter_vlo_data(vlr.first_granule) };
2412        let data = var_len_iter.next().unwrap();
2413        assert!(var_len_iter.next().is_none());
2414        assert_eq!(data.len(), mem::size_of::<u64>());
2415
2416        let val = u64::from_le_bytes(data.try_into().unwrap());
2417        assert_eq!(val, expected_val);
2418    }
2419
2420    #[test]
2421    fn fill_then_iter_var_len_str() {
2422        let mut page = Page::new(STR_ROW_SIZE);
2423
2424        // First the hash is not saved, so compute it.
2425        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2426
2427        // Insert the strings.
2428        let last_val = insert_while(&mut page, 0, STR_ROW_SIZE, 1, |page, val| {
2429            insert_str(page, &val.to_le_bytes());
2430        });
2431
2432        // Hash is cleared by inserting and the new one is different.
2433        let hash_pre_iter = hash_unmodified_save_get(&mut page);
2434        assert_ne!(hash_pre_ins, hash_pre_iter);
2435
2436        // Check that we inserted correctly.
2437        let size_per_row = STR_ROW_SIZE + VarLenGranule::SIZE;
2438
2439        assert_eq!(last_val, (PageOffset::PAGE_END / size_per_row) as u64);
2440
2441        for (row_idx, expected_val) in page.iter_fixed_len(STR_ROW_SIZE).zip(0..last_val) {
2442            check_u64_in_str(&page, row_idx, expected_val);
2443        }
2444
2445        // Reading does not alter the hash.
2446        assert_eq!(hash_pre_iter, page.header.unmodified_hash.unwrap());
2447    }
2448
2449    #[test]
2450    fn fill_delete_iter_var_len_str() {
2451        let mut page = Page::new(STR_ROW_SIZE);
2452
2453        // First the hash is not saved, so compute it.
2454        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2455
2456        // Insert the string rows.
2457        let mut odds = Vec::new();
2458        let last_val = insert_while(&mut page, 0, STR_ROW_SIZE, 1, |page, val| {
2459            let offset = insert_str(page, &val.to_le_bytes());
2460            if val % 2 == 1 {
2461                odds.push(offset);
2462            }
2463        });
2464
2465        let size_per_row = STR_ROW_SIZE + VarLenGranule::SIZE;
2466        let num_rows_inserted = (PageOffset::PAGE_END / size_per_row) as u64;
2467        assert_eq!(last_val, num_rows_inserted);
2468
2469        // Hash was cleared by inserting and is different now.
2470        let hash_pre_del = hash_unmodified_save_get(&mut page);
2471        assert_ne!(hash_pre_ins, hash_pre_del);
2472
2473        // Delete the rows.
2474        for row_offset in odds {
2475            unsafe { page.delete_row(row_offset, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2476        }
2477
2478        // Hash was cleared by deleting and is different now.
2479        let hash_pre_iter = hash_unmodified_save_get(&mut page);
2480        assert_ne!(hash_pre_ins, hash_pre_iter);
2481        assert_ne!(hash_pre_del, hash_pre_iter);
2482
2483        // Check that we deleted correctly.
2484        let num_rows_retained = num_rows_inserted.div_ceil(2);
2485        let num_rows_removed = num_rows_inserted / 2;
2486
2487        assert_eq!(page.header.fixed.num_rows as u64, num_rows_retained);
2488
2489        assert_eq!(page.header.var.freelist_len as u64, num_rows_removed);
2490
2491        for (row_idx, expected_val) in page.iter_fixed_len(STR_ROW_SIZE).zip((0..last_val).step_by(2)) {
2492            check_u64_in_str(&page, row_idx, expected_val);
2493        }
2494
2495        // Reading did not alter the hash.
2496        assert_eq!(hash_pre_iter, page.header.unmodified_hash.unwrap());
2497    }
2498
2499    #[test]
2500    fn serde_round_trip_whole_page() {
2501        let pool = PagePool::new_for_test();
2502        let mut page = Page::new(u64_row_size());
2503
2504        // Construct an empty page, ser/de it, and assert that it's still empty.
2505        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2506        let ser_pre_ins = bsatn::to_vec(&page).unwrap();
2507        let de_pre_ins = pool.take_deserialize_from(&ser_pre_ins).unwrap();
2508        assert_eq!(de_pre_ins.content_hash(), hash_pre_ins);
2509        assert_eq!(de_pre_ins.header.fixed.num_rows, 0);
2510        assert!(de_pre_ins.header.fixed.present_rows == page.header.fixed.present_rows);
2511
2512        // Insert some rows into the page.
2513        let offsets = (0..64)
2514            .map(|val| insert_u64(&mut page, val))
2515            .collect::<Vec<PageOffset>>();
2516
2517        let hash_ins = hash_unmodified_save_get(&mut page);
2518
2519        // Ser/de the page and assert that it contains the same rows.
2520        let ser_ins = bsatn::to_vec(&page).unwrap();
2521        let de_ins = pool.take_deserialize_from(&ser_ins).unwrap();
2522        assert_eq!(de_ins.content_hash(), hash_ins);
2523        assert_eq!(de_ins.header.fixed.num_rows, 64);
2524        assert!(de_ins.header.fixed.present_rows == page.header.fixed.present_rows);
2525        assert_eq!(
2526            de_ins.iter_fixed_len(u64_row_size()).collect::<Vec<PageOffset>>(),
2527            offsets
2528        );
2529
2530        // Delete the even-numbered rows, leaving the odds.
2531        let offsets = offsets
2532            .into_iter()
2533            .enumerate()
2534            .filter_map(|(i, offset)| {
2535                if i % 2 == 0 {
2536                    unsafe { page.delete_row(offset, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
2537                    None
2538                } else {
2539                    Some(offset)
2540                }
2541            })
2542            .collect::<Vec<PageOffset>>();
2543
2544        // Ser/de the page again and assert that it contains only the odd-numbered rows.
2545        let hash_del = hash_unmodified_save_get(&mut page);
2546        let ser_del = bsatn::to_vec(&page).unwrap();
2547        let de_del = pool.take_deserialize_from(&ser_del).unwrap();
2548        assert_eq!(de_del.content_hash(), hash_del);
2549        assert_eq!(de_del.header.fixed.num_rows, 32);
2550        assert!(de_del.header.fixed.present_rows == page.header.fixed.present_rows);
2551        assert_eq!(
2552            de_del.iter_fixed_len(u64_row_size()).collect::<Vec<PageOffset>>(),
2553            offsets
2554        );
2555    }
2556}