spacetimedb_table/
page.rs

1//! Provides a [`Page`] abstraction that stores rows
2//! and an associated header necessary for the page to work.
3//! Consult the documentation of this type for a list of operations
//! and a description of how pages work.
5//!
6//! A page can provide a split mutable view of its fixed section and its variable section.
7//! This is provided through [`Page::split_fixed_var_mut`] with view operations
8//! defined on [`FixedView`] and [`VarView`].
9//!
10//! [ralfj_safe_valid]: https://www.ralfj.de/blog/2018/08/22/two-kinds-of-invariants.html
11//!
12//! Technical terms:
13//!
//! - `valid`, when said of a type, granule, or row (depending on the context),
//!    refers to a memory location that holds a *safe* object.
//!    When "valid for writes" is used, the location must be properly aligned
//!    and none of its bytes may be uninit,
//!    but the value need not be valid at the type in question.
//!    "Valid for writes" is equivalent to valid-unconstrained.
//!
//! - `valid-unconstrained`, when said of a memory location with a given type,
//!    means that the location stores a byte pattern which Rust/LLVM's memory model recognizes as valid,
//!    and therefore must not contain any uninit bytes,
//!    but the value is not required to be logically meaningful,
//!    and no code may depend on the data within it to uphold any invariants.
26//!    E.g. an unallocated [`VarLenGranule`] within a page stores valid-unconstrained bytes,
27//!    because the bytes are either 0 from the initial [`alloc_zeroed`] of the page,
28//!    or contain stale data from a previously freed [`VarLenGranule`].
29//!
30//! - `unused` means that it is safe to overwrite a block of memory without cleaning up its previous value.
31//!
//!    See the post [Two Kinds of Invariants: Safety and Validity][ralfj_safe_valid]
//!    for a discussion on safety and validity invariants.
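//!
//! A rough usage sketch (not a doctest; it elides error handling, and the fixed row size
//! of 16 bytes is an assumed example):
//!
//! ```ignore
//! let mut page = Page::new(Size(16));
//! let (_fixed, mut var) = page.split_fixed_var_mut();
//! // Allocate var-len storage for a 100-byte object; `in_blob` reports whether
//! // the object must instead be inserted into the large-blob store.
//! let (vlr, in_blob) = var.alloc_for_slice(&[0xAB; 100]).expect("page has space");
//! ```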
34
35use super::{
36    blob_store::BlobStore,
37    fixed_bit_set::FixedBitSet,
38    indexes::{Byte, Bytes, PageOffset, Size, PAGE_HEADER_SIZE, PAGE_SIZE},
39    layout::MIN_ROW_SIZE,
40    var_len::{is_granule_offset_aligned, VarLenGranule, VarLenGranuleHeader, VarLenMembers, VarLenRef},
41};
42use crate::{fixed_bit_set::IterSet, indexes::max_rows_in_page, static_assert_size, table::BlobNumBytes, MemoryUsage};
43use core::{mem, ops::ControlFlow};
44use spacetimedb_sats::{de::Deserialize, ser::Serialize};
45use thiserror::Error;
46
47#[derive(Error, Debug, PartialEq, Eq)]
48pub enum Error {
49    #[error("Want to allocate a var-len object of {need} granules, but have only {have} granules available")]
50    InsufficientVarLenSpace { need: u16, have: u16 },
51    #[error("Want to allocate a fixed-len row of {} bytes, but the page is full", need.len())]
52    InsufficientFixedLenSpace { need: Size },
53}
54
55/// A cons-cell in a freelist either
56/// for an unused fixed-len cell or a variable-length granule.
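///
/// Conceptually (a sketch, not code from this module), a freelist rooted at some
/// header's `next_free` is walked like this, where for var-len cells the offset
/// would additionally be adjusted into `var_row_data` (cf. `adjust_free`):
///
/// ```ignore
/// let mut cur: FreeCellRef = header.next_free;
/// while cur.has() {
///     // The first bytes of every free cell store the next `FreeCellRef`.
///     cur = *unsafe { get_ref::<FreeCellRef>(row_data, cur.next) };
/// }
/// ```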
57#[repr(C)] // Required for a stable ABI.
58#[derive(Clone, Copy, Debug, PartialEq, Eq, bytemuck::NoUninit, Serialize, Deserialize)]
59struct FreeCellRef {
60    /// The address of the next free cell in a freelist.
61    ///
62    /// The `PageOffset::PAGE_END` is used as a sentinel to signal "`None`".
63    next: PageOffset,
64}
65
66impl MemoryUsage for FreeCellRef {
67    fn heap_usage(&self) -> usize {
68        let Self { next } = self;
69        next.heap_usage()
70    }
71}
72
73impl FreeCellRef {
74    /// The sentinel for NULL cell references.
75    const NIL: Self = Self {
76        next: PageOffset::PAGE_END,
77    };
78
79    /// Replaces the cell reference with `offset`, returning the existing one.
80    #[inline]
81    fn replace(&mut self, offset: PageOffset) -> FreeCellRef {
82        let next = mem::replace(&mut self.next, offset);
83        Self { next }
84    }
85
86    /// Returns whether the cell reference is non-empty.
87    #[inline]
88    const fn has(&self) -> bool {
89        !self.next.is_at_end()
90    }
91
92    /// Take the first free cell in the freelist starting with `self`, if any,
93    /// and promote the second free cell as the freelist head.
94    ///
95    /// # Safety
96    ///
    /// When `self.has()`, `self.next` (after adjustment by `adjust_free`) must point to a valid `FreeCellRef` in `row_data`.
98    #[inline]
99    unsafe fn take_freelist_head(
100        self: &mut FreeCellRef,
101        row_data: &Bytes,
102        adjust_free: impl FnOnce(PageOffset) -> PageOffset,
103    ) -> Option<PageOffset> {
104        self.has().then(|| {
105            let head = adjust_free(self.next);
            // SAFETY: per the caller contract, `self.has()` holds, so `self.next`, and thus `head`, points to a valid `FreeCellRef`.
107            let next = unsafe { get_ref(row_data, head) };
108            self.replace(*next).next
109        })
110    }
111
112    /// Prepend `new_head` to the freelist starting with `self`.
113    ///
114    /// SAFETY: `new_head`, after adjustment, must be in bounds of `row_data`.
115    /// Moreover, it must be valid for writing a `FreeCellRef` to it,
116    /// which includes being properly aligned with respect to `row_data` for a `FreeCellRef`.
117    /// Additionally, `self` must contain a valid `FreeCellRef`.
118    #[inline]
119    unsafe fn prepend_freelist(
120        self: &mut FreeCellRef,
121        row_data: &mut Bytes,
122        new_head: PageOffset,
123        adjust_free: impl FnOnce(PageOffset) -> PageOffset,
124    ) {
125        let next = self.replace(new_head);
126        let new_head = adjust_free(new_head);
127        // SAFETY: Per caller contract, `new_head` is in bounds of `row_data`.
128        // Moreover, `new_head` points to an unused `FreeCellRef`, so we can write to it.
129        let next_slot: &mut FreeCellRef = unsafe { get_mut(row_data, new_head) };
130        *next_slot = next;
131    }
132}
133
134/// All the fixed size header information.
135#[repr(C)] // Required for a stable ABI.
136#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // So we can dump and restore pages during snapshotting.
137struct FixedHeader {
138    /// A pointer to the head of the freelist which stores
139    /// all the unused (freed) fixed row cells.
140    /// These cells can be reused when inserting a new row.
141    next_free: FreeCellRef,
142
143    /// High water mark (HWM) for fixed-length rows.
144    /// Points one past the last-allocated (highest-indexed) fixed-length row,
145    /// so to allocate a fixed-length row from the gap,
146    /// post-increment this index.
147    // TODO(perf,future-work): determine how to lower the high water mark when freeing the topmost row.
148    last: PageOffset,
149
150    /// The number of rows currently in the page.
151    ///
152    /// N.B. this is not the same as `self.present_rows.len()`
153    /// as that counts both zero and one bits.
154    num_rows: u16,
155
156    // TODO(stable-module-abi): should this be inlined into the page?
157    /// For each fixed-length row slot, true if a row is stored there,
158    /// false if the slot is unallocated.
159    ///
160    /// Unallocated row slots store valid-unconstrained bytes, i.e. are never uninit.
161    present_rows: FixedBitSet,
162}
163
164impl MemoryUsage for FixedHeader {
165    fn heap_usage(&self) -> usize {
166        let Self {
167            next_free,
168            last,
169            num_rows,
170            present_rows,
171        } = self;
172        next_free.heap_usage() + last.heap_usage() + num_rows.heap_usage() + present_rows.heap_usage()
173    }
174}
175
176static_assert_size!(FixedHeader, 16);
177
178impl FixedHeader {
179    /// Returns a new `FixedHeader`
180    /// using the provided `max_rows_in_page` to decide how many rows `present_rows` can represent.
181    #[inline]
182    fn new(max_rows_in_page: usize) -> Self {
183        Self {
184            next_free: FreeCellRef::NIL,
185            // Points one after the last allocated fixed-length row, or `NULL` for an empty page.
186            last: PageOffset::VAR_LEN_NULL,
187            num_rows: 0,
188            present_rows: FixedBitSet::new(max_rows_in_page),
189        }
190    }
191
192    /// Set the (fixed) row starting at `offset`
193    /// and lasting `fixed_row_size` as `present`.
194    #[inline]
195    fn set_row_present(&mut self, offset: PageOffset, fixed_row_size: Size) {
196        self.set_row_presence(offset, fixed_row_size, true);
197        self.num_rows += 1;
198    }
199
200    /// Sets whether the (fixed) row starting at `offset`
201    /// and lasting `fixed_row_size` is `present` or not.
202    #[inline]
203    fn set_row_presence(&mut self, offset: PageOffset, fixed_row_size: Size, present: bool) {
204        self.present_rows.set(offset / fixed_row_size, present);
205    }
206
207    /// Returns whether the (fixed) row starting at `offset`
208    /// and lasting `fixed_row_size` is present or not.
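    ///
    /// For example (assumed numbers): with `fixed_row_size = 32`,
    /// the row starting at `offset = 96` maps to bit `96 / 32 = 3` of `present_rows`.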
209    #[inline]
210    fn is_row_present(&self, offset: PageOffset, fixed_row_size: Size) -> bool {
211        self.present_rows.get(offset / fixed_row_size)
212    }
213
214    /// Resets the header information to its state
215    /// when it was first created in [`FixedHeader::new`]
216    /// but with `max_rows_in_page` instead of the value passed on creation.
217    fn reset_for(&mut self, max_rows_in_page: usize) {
218        self.next_free = FreeCellRef::NIL;
219        self.last = PageOffset::VAR_LEN_NULL;
220        self.num_rows = 0;
221        self.present_rows.reset_for(max_rows_in_page);
222    }
223
224    /// Resets the header information to its state
225    /// when it was first created in [`FixedHeader::new`].
226    ///
227    /// The header is only good for the original row size.
228    #[inline]
229    fn clear(&mut self) {
230        self.next_free = FreeCellRef::NIL;
231        self.last = PageOffset::VAR_LEN_NULL;
232        self.num_rows = 0;
233        self.present_rows.clear();
234    }
235}
236
237/// All the var-len header information.
238#[repr(C)] // Required for a stable ABI.
239#[derive(bytemuck::NoUninit, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
240struct VarHeader {
241    /// A pointer to the head of the freelist which stores
242    /// all the unused (freed) var-len granules.
243    /// These cells can be reused when inserting a new row.
244    next_free: FreeCellRef,
245
246    /// The length of the freelist with its head referred to by `next_free`.
247    /// Stored in units of var-len nodes.
248    ///
249    /// This field is redundant,
250    /// as it can be recovered by traversing `next_free`.
251    /// However, traversing this linked-list is not cache friendly,
252    /// so we memoize the computation here.
253    freelist_len: u16,
254
255    /// High water mark (HWM) for var-len granules.
256    /// Points to the last-allocated (lowest-indexed) var-len granule,
257    /// so to allocate a var-len granule from the gap,
258    /// pre-decrement this index.
259    // TODO(perf,future-work): determine how to "lower" the high water mark when freeing the "top"-most granule.
260    first: PageOffset,
261
262    /// The number of granules currently used by rows within this page.
263    ///
264    /// [`Page::bytes_used_by_rows`] needs this information.
265    /// Stored here because otherwise counting it would require traversing all the present rows.
266    num_granules: u16,
267}
268
269impl MemoryUsage for VarHeader {
270    fn heap_usage(&self) -> usize {
271        let Self {
272            next_free,
273            freelist_len,
274            first,
275            num_granules,
276        } = self;
277        next_free.heap_usage() + freelist_len.heap_usage() + first.heap_usage() + num_granules.heap_usage()
278    }
279}
280
281static_assert_size!(VarHeader, 8);
282
283impl Default for VarHeader {
284    fn default() -> Self {
285        Self {
286            next_free: FreeCellRef::NIL,
287            freelist_len: 0,
288            first: PageOffset::PAGE_END,
289            num_granules: 0,
290        }
291    }
292}
293
294impl VarHeader {
295    /// Resets the header information to its state
296    /// when it was first created in [`VarHeader::default`].
297    fn clear(&mut self) {
298        *self = Self::default();
299    }
300}
301
302/// The metadata / header of a page that is necessary
303/// for modifying and interpreting the `row_data`.
304///
305/// This header info is split into a header for the fixed part
306/// and one for the variable part.
307/// The header is stored in the same heap allocation as the `row_data`
308/// as the whole [`Page`] is `Box`ed.
309#[repr(C)] // Required for a stable ABI.
310#[repr(align(64))] // Alignment must be same as `VarLenGranule::SIZE`.
311#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // So we can dump and restore pages during snapshotting.
312pub(super) struct PageHeader {
313    /// The header data relating to the fixed component of a row.
314    fixed: FixedHeader,
315    /// The header data relating to the var-len component of a row.
316    var: VarHeader,
317    /// The content-addressed hash of the page on disk,
318    /// if unmodified since the last snapshot,
319    /// and `None` otherwise.
320    ///
    /// This means that modifications to the page always set this field to `None`.
322    unmodified_hash: Option<blake3::Hash>,
323}
324
325impl MemoryUsage for PageHeader {
326    fn heap_usage(&self) -> usize {
327        let Self {
328            fixed,
329            var,
330            // MEMUSE: no allocation, ok to ignore
331            unmodified_hash: _,
332        } = self;
333        fixed.heap_usage() + var.heap_usage()
334    }
335}
336
337static_assert_size!(PageHeader, PAGE_HEADER_SIZE);
338
339impl PageHeader {
    /// Returns a new `PageHeader` proper for a [`Page`] holding at most `max_rows_in_page` rows.
341    fn new(max_rows_in_page: usize) -> Self {
342        Self {
343            fixed: FixedHeader::new(max_rows_in_page),
344            var: VarHeader::default(),
345            unmodified_hash: None,
346        }
347    }
348
349    /// Resets the header information to its state
350    /// when it was first created in [`PageHeader::new`]
351    /// but with `max_rows_in_page` instead of the value passed on creation.
352    fn reset_for(&mut self, max_rows_in_page: usize) {
353        self.fixed.reset_for(max_rows_in_page);
354        self.var.clear();
355        self.unmodified_hash = None;
356    }
357
358    /// Resets the header information to its state
359    /// when it was first created in [`PageHeader::new`].
360    ///
361    /// The header is only good for the original row size.
362    fn clear(&mut self) {
363        self.fixed.clear();
364        self.var.clear();
365        self.unmodified_hash = None;
366    }
367
368    /// Returns the maximum number of rows the page can hold.
369    ///
370    /// Note that this number can be bigger
371    /// than the value provided in [`Self::new`] due to rounding up.
372    pub(super) fn max_rows_in_page(&self) -> usize {
373        self.fixed.present_rows.bits()
374    }
375
376    /// Returns a pointer to the `present_rows` bitset.
377    /// This is exposed for testing only.
378    #[cfg(test)]
379    pub(super) fn present_rows_storage_ptr_for_test(&self) -> *const () {
380        self.fixed.present_rows.storage().as_ptr().cast()
381    }
382}
383
384/// Fixed-length row portions must be at least large enough to store a `FreeCellRef`.
385const _MIN_ROW_SIZE_CAN_STORE_FCR: () = assert!(MIN_ROW_SIZE.len() >= mem::size_of::<FreeCellRef>());
386
/// [`VarLenGranule`]s must be at least as large as `MIN_ROW_SIZE`,
/// which, combined with the assertion above, means they can store a [`FreeCellRef`].
388const _VLG_CAN_STORE_FCR: () = assert!(VarLenGranule::SIZE.len() >= MIN_ROW_SIZE.len());
389
390/// Pointers properly aligned for a [`VarLenGranule`] must be properly aligned for [`FreeCellRef`].
391/// This is the case as the former's alignment is a multiple of the latter's alignment.
392const _VLG_ALIGN_MULTIPLE_OF_FCR: () = assert!(mem::align_of::<VarLenGranule>() % mem::align_of::<FreeCellRef>() == 0);
393
394/// The actual row data of a [`Page`].
395type RowData = [Byte; PageOffset::PAGE_END.idx()];
396
397/// A page of row data with an associated `header` and the raw `row_data` itself.
398///
399/// As a rough summary, the strategy employed by this page is:
400///
/// - The fixed-len parts of rows grow left-to-right
402///   and starts from the beginning of the `row_data`
403///   until its high water mark (fixed HWM), i.e., `self.header.fixed.last`.
404///
/// - The var-len parts of rows grow right-to-left
406///   and starts from the end of the `row_data`
407///   until its high water mark (variable HWM), i.e., `self.header.var.first`.
408///
409///   Each var-len object is stored in terms of a linked-list of chunks.
410///   Each chunk in this case is a [`VarLenGranule`] taking up 64 bytes where:
411///   - 6 bits = length, 10 bits = next-cell-pointer
412///   - 62 bytes = the bytes of the object
413///
414/// - As new rows are added, the HWMs move appropriately.
415///   When the fixed and variable HWMs meet, the page is full.
416///
417/// - When rows are freed, a freelist strategy is used both for
418///   the fixed parts and each `VarLenGranule`.
419///   These freelists are then used first before using space from the gap.
420///   The head of these freelists are stored in `next_free`
421///   in the fixed and variable headers respectively.
422///
423/// - As the fixed parts of rows may store pointers into the var-length section,
424///   to ensure that these pointers aren't dangling,
425///   the page uses pointer fixups when adding to, deleting from, and copying the page.
426///   These fixups are handled by having callers provide `VarLenMembers`
427///   to find the var-len reference slots in the fixed parts.
428#[repr(C)]
429// ^-- Required for a stable ABI.
430#[repr(align(64))]
431// ^-- Must have align at least that of `VarLenGranule`,
432// so that `row_data[PageOffset::PAGE_END - VarLenGranule::SIZE]` is an aligned pointer to `VarLenGranule`.
433// TODO(bikeshedding): consider raising the alignment. We may want this to be OS page (4096) aligned.
434#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] // So we can dump and restore pages during snapshotting.
435pub struct Page {
436    /// The header containing metadata on how to interpret and modify the `row_data`.
437    header: PageHeader,
438    /// The actual bytes stored in the page.
439    /// This contains row data, fixed and variable, and freelists.
440    row_data: RowData,
441}
442
443impl MemoryUsage for Page {
444    fn heap_usage(&self) -> usize {
445        self.header.heap_usage()
446    }
447}
448
449static_assert_size!(Page, PAGE_SIZE);
450
451/// A mutable view of the fixed-len section of a [`Page`].
452pub struct FixedView<'page> {
453    /// A mutable view of the fixed-len bytes.
454    fixed_row_data: &'page mut Bytes,
455    /// A mutable view of the fixed header.
456    header: &'page mut FixedHeader,
457}
458
459impl FixedView<'_> {
460    /// Returns a mutable view of the row from `start` lasting `fixed_row_size` number of bytes.
461    ///
462    /// This method is safe, but callers should take care that `start` and `fixed_row_size`
463    /// are correct for this page, and that `start` is aligned.
464    /// Callers should further ensure that mutations to the row leave the row bytes
465    /// in an expected state, i.e. initialized where required by the row type,
466    /// and with `VarLenRef`s that point to valid granules and with correct lengths.
467    pub fn get_row_mut(&mut self, start: PageOffset, fixed_row_size: Size) -> &mut Bytes {
468        &mut self.fixed_row_data[start.range(fixed_row_size)]
469    }
470
471    /// Returns a shared view of the row from `start` lasting `fixed_row_size` number of bytes.
472    fn get_row(&mut self, start: PageOffset, fixed_row_size: Size) -> &Bytes {
473        &self.fixed_row_data[start.range(fixed_row_size)]
474    }
475
476    /// Frees the row starting at `row_offset` and lasting `fixed_row_size` bytes.
477    ///
478    /// # Safety
479    ///
480    /// `range_move(0..fixed_row_size, row_offset)` must be in bounds of `row_data`.
481    /// Moreover, it must be valid for writing a `FreeCellRef` to it,
482    /// which includes being properly aligned with respect to `row_data` for a `FreeCellRef`.
483    pub unsafe fn free(&mut self, row_offset: PageOffset, fixed_row_size: Size) {
484        // TODO(perf,future-work): if `row` is at the HWM, return it to the gap.
485
486        // SAFETY: Per caller contract, `row_offset` must be in bounds of `row_data`.
487        // Moreover, it must be valid for writing a `FreeCellRef` to it,
488        // which includes being properly aligned with respect to `row_data` for a `FreeCellRef`.
489        // We also know that `self.header.next_free` contains a valid `FreeCellRef`.
490        unsafe {
491            self.header
492                .next_free
493                .prepend_freelist(self.fixed_row_data, row_offset, |x| x)
494        };
495        self.header.num_rows -= 1;
496        self.header.set_row_presence(row_offset, fixed_row_size, false);
497    }
498}
499
500/// A mutable view of the var-len section of a [`Page`].
501pub struct VarView<'page> {
502    /// A mutable view of the var-len bytes.
503    var_row_data: &'page mut Bytes,
504    /// A mutable view of the var-len header.
505    header: &'page mut VarHeader,
506    /// One past the end of the fixed-len section of the page.
507    last_fixed: PageOffset,
508}
509
510impl<'page> VarView<'page> {
511    /// Returns the number of granules required to store the data,
512    /// whether the page has enough space,
513    /// and whether the object needs to go in the blob store.
514    ///
515    /// If the third value is `true`, i.e., the object will go in the blob store,
516    /// the first value will always be `1`.
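    ///
    /// For example (a sketch with assumed numbers, taking `VarLenGranule::DATA_SIZE` to be
    /// the 62 bytes described in the [`Page`] docs): a 100-byte object that stays in the page
    /// needs `ceil(100 / 62) = 2` granules.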
517    fn has_enough_space_for(&self, len_in_bytes: usize) -> (usize, bool, bool) {
518        let (num_granules_req, in_blob) = VarLenGranule::bytes_to_granules(len_in_bytes);
519        let enough_space = num_granules_req <= self.num_granules_available();
520        (num_granules_req, enough_space, in_blob)
521    }
522
523    /// Returns the number of granules available for allocation.
524    fn num_granules_available(&self) -> usize {
525        self.header.freelist_len as usize
526            + VarLenGranule::space_to_granules(gap_remaining_size(self.header.first, self.last_fixed))
527    }
528
    /// Provides an adjuster that converts an offset in terms of `Page::row_data`
    /// into one in terms of `VarView::var_row_data`.
531    ///
532    /// This has to be done due to `page.row_data.split_at_mut(last_fixed)`.
533    #[inline(always)]
534    fn adjuster(&self) -> impl FnOnce(PageOffset) -> PageOffset {
535        let lf = self.last_fixed;
536        move |offset| offset - lf
537    }
538
539    /// Allocates a linked-list of granules, in the var-len storage of the page,
540    /// for a var-len object of `obj_len` bytes.
541    ///
542    /// Returns a [`VarLenRef`] pointing to the head of that list,
543    /// and a boolean `in_blob` for whether the allocation is a `BlobHash`
544    /// and the object must be inserted into the large-blob store.
545    ///
546    /// The length of each granule is set, but no data is written to any granule.
547    /// Thus, the caller must proceed to write data to each granule for the claimed lengths.
548    ///
549    /// # Safety post-requirements
550    ///
551    /// The following are the safety *post-requirements* of calling this method.
552    /// That is, this method is safe to call,
553    /// but may leave the page in an inconsistent state
554    /// which must be rectified before other **unsafe methods** may be called.
555    ///
556    /// 1. When the returned `in_blob` holds, caller must ensure that,
557    ///    before the granule's data is read from / assumed to be initialized,
558    ///    the granule pointed to by the returned `vlr.first_granule`
559    ///    has an initialized header and a data section initialized to at least
560    ///    as many bytes as claimed by the header.
561    ///
562    /// 2. The caller must initialize each granule with data for the claimed length
563    ///    of the granule's data.
564    pub fn alloc_for_len(&mut self, obj_len: usize) -> Result<(VarLenRef, bool), Error> {
565        // Safety post-requirements of `alloc_for_obj_common`:
566        // 1. caller promised they will be satisfied.
567        // 2a. already satisfied as the closure below returns all the summands of `obj_len`.
568        // 2b. caller promised in 2. that they will satisfy this.
569        self.alloc_for_obj_common(obj_len, |req_granules| {
570            let rem = obj_len % VarLenGranule::DATA_SIZE;
571            (0..req_granules).map(move |rev_idx| {
572                let len = if rev_idx == 0 && rem != 0 {
573                    // The first allocated granule will be the last in the list.
574                    // Thus, `rev_idx == 0` is the last element and might not take up a full granule.
575                    rem
576                } else {
577                    VarLenGranule::DATA_SIZE
578                };
579                // Caller will initialize the granule's data for `len` bytes.
580                (<&[u8]>::default(), len)
581            })
582        })
583    }
584
585    /// Returns an iterator over all offsets of the `VarLenGranule`s of the var-len object
586    /// that has its first granule at offset `first_granule`.
587    /// An empty iterator will be returned when `first_granule` is `NULL`.
588    ///
589    /// # Safety
590    ///
591    /// `first_granule` must be an offset to a granule or `NULL`.
592    /// The data of the granule need not be initialized.
593    pub unsafe fn granule_offset_iter(&mut self, first_granule: PageOffset) -> GranuleOffsetIter<'_, 'page> {
594        GranuleOffsetIter {
595            next_granule: first_granule,
596            var_view: self,
597        }
598    }
599
600    /// Allocates and stores `slice` as a linked-list of granules
601    /// in the var-len storage of the page.
602    ///
603    /// Returns a [`VarLenRef`] pointing to the head of that list,
604    /// and a boolean `in_blob` for whether the allocation is a `BlobHash`
605    /// and the `slice` must be inserted into the large-blob store.
606    ///
607    /// # Safety post-requirements
608    ///
609    /// The following are the safety *post-requirements* of calling this method.
610    /// That is, this method is safe to call,
611    /// but may leave the page in an inconsistent state
612    /// which must be rectified before other **unsafe methods** may be called.
613    ///
614    /// 1. When the returned `in_blob` holds, caller must ensure that,
615    ///    before the granule's data is read from / assumed to be initialized,
616    ///    the granule pointed to by the returned `vlr.first_granule`
617    ///    has an initialized header and a data section initialized to at least
618    ///    as many bytes as claimed by the header.
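    ///
    /// A rough sketch (not a doctest) of a typical call, including the large-blob case:
    ///
    /// ```ignore
    /// let (vlr, in_blob) = var.alloc_for_slice(obj)?;
    /// if in_blob {
    ///     // SAFETY: `vlr.first_granule` was just allocated for us, is in bounds, and is unused.
    ///     unsafe { var.write_large_blob_hash_to_granule(blob_store, &obj, vlr) };
    /// }
    /// ```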
619    pub fn alloc_for_slice(&mut self, slice: &[u8]) -> Result<(VarLenRef, bool), Error> {
620        let obj_len = slice.len();
621        // Safety post-requirement 2. of `alloc_for_obj_common` is already satisfied
622        // as `chunks(slice)` will return sub-slices where the sum is `obj_len`.
623        // Moreover, we initialize each granule already with the right data and length.
624        // The requirement 1. is forwarded to the caller.
625        let chunks = |_| VarLenGranule::chunks(slice).rev().map(|c| (c, c.len()));
626        self.alloc_for_obj_common(obj_len, chunks)
627    }
628
629    /// Allocates for `obj_len` bytes as a linked-list of granules
630    /// in the var-len storage of the page.
631    ///
632    /// For every granule in the aforementioned linked-list,
633    /// the caller must provide an element in the *reversed* iterator `chunks`,
634    /// and of pairs `(chunk, len)`.
635    /// To each granule `chunk` will be written and the granule will be of length `len`.
    /// The caller may provide a `chunk` whose actual length differs from `len`,
    /// e.g., an empty chunk whose data will be written later.
637    ///
638    /// Returns a [`VarLenRef`] pointing to the head of that list,
639    /// and a boolean `in_blob` for whether the allocation is a `BlobHash`
    /// and the object must be inserted into the large-blob store.
641    ///
642    /// # Safety post-requirements
643    ///
644    /// The following are the safety *post-requirements* of calling this method.
645    /// That is, this method is safe to call,
646    /// but may leave the page in an inconsistent state
647    /// which must be rectified before other **unsafe methods** may be called.
648    ///
649    /// 1. When the returned `in_blob` holds, caller must ensure that,
650    ///    before the granule's data is read from / assumed to be initialized,
651    ///    the granule pointed to by the returned `vlr.first_granule`
652    ///    has an initialized header and a data section initialized to at least
653    ///    as many bytes as claimed by the header.
654    ///
655    /// 2. Otherwise, when `in_blob` doesn't hold the safety post-requirements are:
656    ///
657    ///    a. Let `cs = chunks(req_granules)` for the `req_granules` derived from `obj_len`.
658    ///       Then, `obj_len == cs.map(|(_, len)| len).sum()`.
659    ///
660    ///    b. For each `(_, len) ∈ cs`, caller must ensure that
661    ///       the relevant granule is initialized with data for at least `len`
662    ///       before the granule's data is read from / assumed to be initialized.
663    fn alloc_for_obj_common<'chunk, Cs: Iterator<Item = (&'chunk [u8], usize)>>(
664        &mut self,
665        obj_len: usize,
666        chunks: impl Copy + FnOnce(usize) -> Cs,
667    ) -> Result<(VarLenRef, bool), Error> {
668        // Check that we have sufficient space to allocate `obj_len` bytes in var-len data.
669        let (req_granules, enough_space, in_blob) = self.has_enough_space_for(obj_len);
670        if !enough_space {
671            return Err(Error::InsufficientVarLenSpace {
672                need: req_granules.try_into().unwrap_or(u16::MAX),
673                have: self.num_granules_available().try_into().unwrap_or(u16::MAX),
674            });
675        }
676
677        // For large blob objects, only reserve a granule.
678        // The caller promised that they will initialize it with a blob hash.
679        if in_blob {
680            let vlr = self.alloc_blob_hash()?;
681            return Ok((vlr, true));
682        };
683
684        // Write each `chunk` to var-len storage.
685        // To do this, we allocate granules for and store the chunks in reverse,
686        // starting with the end first.
687        // The offset to the previous granule in the iteration is kept to
688        // link it in as the next pointer in the current iteration.
689        let mut next = PageOffset::VAR_LEN_NULL;
690        debug_assert_eq!(obj_len, chunks(req_granules).map(|(_, len)| len).sum::<usize>());
691        for (chunk, len) in chunks(req_granules) {
692            // This should never error, since we already checked for available space.
693            let granule = self.alloc_granule()?;
694            // SAFETY:
695            // 1. `granule` is properly aligned as it came from `alloc_granule`
696            //    and so is `next` as it's either NULL or was the previous `granule`.
697            //    This also ensures that both are in bounds
            //    of the page for `granule .. granule + VarLenGranule::SIZE`.
699            //
700            // 2. `next` is either NULL or was initialized in the previous loop iteration.
701            //
702            // 3. `granule` points to an unused slot as the space was just allocated.
703            unsafe { self.write_chunk_to_granule(chunk, len, granule, next) };
704            next = granule;
705        }
706
707        Ok((
708            VarLenRef {
709                first_granule: next,
710                length_in_bytes: obj_len as u16,
711            },
712            false,
713        ))
714    }
715
716    /// Allocates a granule for a large blob object
717    /// and returns a [`VarLenRef`] pointing to that granule.
718    ///
719    /// The granule is not initialized by this method, and contains valid-unconstrained bytes.
720    /// It is the caller's responsibility to initialize it with a [`BlobHash`](super::blob_hash::BlobHash).
721    #[cold]
722    fn alloc_blob_hash(&mut self) -> Result<VarLenRef, Error> {
723        // Var-len hashes are 32 bytes, which fits within a single granule.
724        self.alloc_granule().map(VarLenRef::large_blob)
725    }
726
727    /// Inserts `var_len_obj` into `blob_store`
728    /// and stores the blob hash in the granule pointed to by `vlr.first_granule`.
729    ///
730    /// This insertion will never fail.
731    ///
732    /// # Safety
733    ///
734    /// `vlr.first_granule` must point to an unused `VarLenGranule` in bounds of this page,
735    /// which must be valid for writes.
736    pub unsafe fn write_large_blob_hash_to_granule(
737        &mut self,
738        blob_store: &mut dyn BlobStore,
739        var_len_obj: &impl AsRef<[u8]>,
740        vlr: VarLenRef,
741    ) -> BlobNumBytes {
742        let hash = blob_store.insert_blob(var_len_obj.as_ref());
743
744        let granule = vlr.first_granule;
745        // SAFETY:
746        // 1. `granule` is properly aligned for `VarLenGranule` and is in bounds of the page.
747        // 2. The null granule is trivially initialized.
748        // 3. The caller promised that `granule` is safe to overwrite.
749        unsafe { self.write_chunk_to_granule(&hash.data, hash.data.len(), granule, PageOffset::VAR_LEN_NULL) };
750        var_len_obj.as_ref().len().into()
751    }
752
753    /// Write the `chunk` (data) to the [`VarLenGranule`] pointed to by `granule`,
754    /// set the granule's length to be `len`,
755    /// and set the next granule in the list to `next`.
756    ///
757    /// SAFETY:
758    ///
759    /// 1. Both `granule` and `next` must be properly aligned pointers to [`VarLenGranule`]s
    ///    and they must be in bounds of the page. However, neither needs to point to initialized data.
761    ///
762    /// 2. The caller must initialize the granule pointed to by `next`
763    ///    before the granule-list is read from (e.g., iterated on).
764    ///    The null granule is considered trivially initialized.
765    ///
766    /// 3. The space pointed to by `granule` must be unused and valid for writes,
767    ///    and will be overwritten here.
768    unsafe fn write_chunk_to_granule(&mut self, chunk: &[u8], len: usize, granule: PageOffset, next: PageOffset) {
769        let granule = self.adjuster()(granule);
770        // SAFETY: A `PageOffset` is always in bounds of the page.
771        let ptr: *mut VarLenGranule = unsafe { offset_to_ptr_mut(self.var_row_data, granule).cast() };
772
773        // TODO(centril,bikeshedding): check if creating the `VarLenGranule` first on stack
774        // and then writing to `ptr` would have any impact on perf.
775        // This would be nicer as it requires less `unsafe`.
776
        // We need to initialize the granule's header
        // without materializing a `&mut` as that is instant UB.
        // SAFETY: `ptr` isn't NULL as `&mut self.var_row_data` itself is a non-null pointer.
780        let header = unsafe { &raw mut (*ptr).header };
781
782        // SAFETY: `header` is valid for writes as only we have exclusive access.
783        //          (1) The `ptr` was also promised as aligned
        //          and `granule .. granule + 64 bytes` is in bounds of the page per caller contract.
785        //          (2) Moreover, `next` will be an initialized granule per caller contract,
786        //          so we can link it into the list without causing UB elsewhere.
787        //          (3) It's also OK to write to `granule` as it's unused.
788        unsafe {
789            header.write(VarLenGranuleHeader::new(len as u8, next));
790        }
791
792        // SAFETY: We can treat any part of `row_data` as `.data`. Also (1) and (2).
793        let data = unsafe { &mut (*ptr).data };
794
795        // Copy the data into the granule.
796        data[0..chunk.len()].copy_from_slice(chunk);
797    }
798
799    /// Allocate a [`VarLenGranule`] at the returned [`PageOffset`].
800    ///
801    /// The allocated storage is not initialized by this method,
802    /// and will be valid-unconstrained at [`VarLenGranule`].
803    ///
804    /// This offset will be properly aligned for `VarLenGranule` when converted to a pointer.
805    ///
806    /// Returns an error when there are neither free granules nor space in the gap left.
807    fn alloc_granule(&mut self) -> Result<PageOffset, Error> {
808        let granule = self
809            .alloc_from_freelist()
810            .or_else(|| self.alloc_from_gap())
811            .ok_or(Error::InsufficientVarLenSpace { need: 1, have: 0 })?;
812
813        debug_assert!(
814            is_granule_offset_aligned(granule),
815            "Allocated an unaligned var-len granule: {:x}",
816            granule,
817        );
818
819        self.header.num_granules += 1;
820
821        Ok(granule)
822    }
823
824    /// Allocate a [`VarLenGranule`] at the returned [`PageOffset`]
825    /// taken from the freelist, if any.
826    #[inline]
827    fn alloc_from_freelist(&mut self) -> Option<PageOffset> {
        // SAFETY: `header.next_free` points to a valid `FreeCellRef` when `.has()` holds.
829        let free = unsafe {
830            self.header
831                .next_free
832                .take_freelist_head(self.var_row_data, |o| o - self.last_fixed)
833        }?;
834        self.header.freelist_len -= 1;
835        Some(free)
836    }
837
838    /// Allocate a [`VarLenGranule`] at the returned [`PageOffset`]
839    /// taken from the gap, if there is space left, or `None` if there is insufficient space.
840    #[inline]
841    fn alloc_from_gap(&mut self) -> Option<PageOffset> {
842        if gap_enough_size_for_row(self.header.first, self.last_fixed, VarLenGranule::SIZE) {
843            // `var.first` points *at* the lowest-indexed var-len granule,
844            // *not* before it, so pre-decrement.
845            self.header.first -= VarLenGranule::SIZE;
846            Some(self.header.first)
847        } else {
848            None
849        }
850    }
851
    /// Free a single var-len granule pointed to by `offset`.
853    ///
854    /// SAFETY: `offset` must point to a valid [`VarLenGranule`].
855    #[inline]
856    unsafe fn free_granule(&mut self, offset: PageOffset) {
857        // TODO(perf,future-work): if `chunk` is at the HWM, return it to the gap.
858        //       Returning a single chunk to the gap is easy,
859        //       but we want to return a whole "run" of sequential freed chunks,
860        //       which requires some bookkeeping (or an O(> n) linked list traversal).
861        self.header.freelist_len += 1;
862        self.header.num_granules -= 1;
863        let adjuster = self.adjuster();
864
865        // SAFETY: Per caller contract, `offset` is a valid `VarLenGranule`,
866        // and is therefore in bounds of the page row data.
867        // By `_VLG_CAN_STORE_FCR`, and as we won't be reading from the granule anymore,
868        // we know that this makes it valid for writing a `FreeCellRef` to it.
869        // Moreover, by `_VLG_ALIGN_MULTIPLE_OF_FCR`,
870        // the derived pointer is properly aligned (64) for a granule
871        // and as `64 % 2 == 0` the alignment of a granule works for a `FreeCellRef`.
872        // Finally, `self.header.next_free` contains a valid `FreeCellRef`.
873        unsafe {
874            self.header
875                .next_free
876                .prepend_freelist(self.var_row_data, offset, adjuster)
877        };
878    }
879
880    /// Returns a reference to the granule at `offset`.
881    ///
882    /// SAFETY: `offset` must point to a valid [`VarLenGranule`].
883    unsafe fn get_granule_ref(&self, offset: PageOffset) -> &VarLenGranule {
884        unsafe { get_ref(self.var_row_data, self.adjuster()(offset)) }
885    }
886
887    /// Frees the blob pointed to by the [`BlobHash`] stored in the granule at `offset`.
888    ///
889    /// Panics when `offset` is NULL.
890    ///
891    /// SAFETY: `offset` must point to a valid [`VarLenGranule`] or be NULL.
892    #[cold]
893    #[inline(never)]
894    unsafe fn free_blob(&self, offset: PageOffset, blob_store: &mut dyn BlobStore) -> BlobNumBytes {
895        assert!(!offset.is_var_len_null());
896
897        // SAFETY: Per caller contract + the assertion above,
898        // we know `offset` refers to a valid `VarLenGranule`.
899        let granule = unsafe { self.get_granule_ref(offset) };
900
        // Retrieve the blob hash stored in the granule.
902        let hash = granule.blob_hash();
903
904        // The size of `deleted_bytes` is calculated here instead of requesting it from `blob_store`.
905        // This is because the actual number of bytes deleted depends on the `blob_store`'s logic.
906        // We prefer to measure it from the datastore's point of view.
907        let blob_store_deleted_bytes = blob_store
908            .retrieve_blob(&hash)
909            .expect("failed to free var-len blob")
910            .len()
911            .into();
912
913        // Actually free the blob.
914        blob_store.free_blob(&hash).expect("failed to free var-len blob");
915
916        blob_store_deleted_bytes
917    }
918
919    /// Frees an entire var-len linked-list object.
920    ///
921    /// If the `var_len_obj` is a large blob,
922    /// the `VarLenGranule` which stores its blob hash will be freed from the page,
923    /// but the blob itself will not be freed from the blob store.
924    /// If used incorrectly, this may leak large blobs.
925    ///
    /// This behavior is used to roll back on failure in [`crate::bflatn::ser::write_av_to_page`],
927    /// where inserting large blobs is deferred until all allocations succeed.
928    /// Freeing a fully-inserted object should instead use [`Self::free_object`].
929    ///
930    /// # Safety
931    ///
932    /// `var_len_obj.first_granule` must point to a valid [`VarLenGranule`] or be NULL.
933    pub unsafe fn free_object_ignore_blob(&mut self, var_len_obj: VarLenRef) {
934        let mut next_granule = var_len_obj.first_granule;
935
936        while !next_granule.is_var_len_null() {
937            // SAFETY: Per caller contract, `first_granule` points to a valid granule or is NULL.
938            // We know however at this point that it isn't NULL so it is valid.
939            // Thus the successor is too a valid granule or NULL.
940            // However, again, at this point we know that the successor isn't NULL.
941            // It follows then by induction that any `next_granule` at this point is valid.
942            // Thus we have fulfilled the requirement that `next_granule` points to a valid granule.
943            let header = unsafe { self.get_granule_ref(next_granule) }.header;
944            // SAFETY: `next_granule` still points to a valid granule per above.
945            unsafe {
946                self.free_granule(next_granule);
947            }
948            next_granule = header.next();
949        }
950    }
951
952    /// Frees an entire var-len linked-list object.
953    ///
954    /// SAFETY: `var_len_obj.first_granule` must point to a valid [`VarLenGranule`] or be NULL.
955    unsafe fn free_object(&mut self, var_len_obj: VarLenRef, blob_store: &mut dyn BlobStore) -> BlobNumBytes {
956        let mut blob_store_deleted_bytes = BlobNumBytes::default();
957        // For large blob objects, extract the hash and tell `blob_store` to discard it.
958        if var_len_obj.is_large_blob() {
959            // SAFETY: `var_len_obj.first_granule` was promised to
960            // point to a valid [`VarLenGranule`] or be NULL, as required.
961            unsafe {
962                blob_store_deleted_bytes = self.free_blob(var_len_obj.first_granule, blob_store);
963            }
964        }
965
966        // SAFETY: `free_object_ignore_blob` has the same safety contract as this method.
967        unsafe {
968            self.free_object_ignore_blob(var_len_obj);
969        }
970
971        blob_store_deleted_bytes
972    }
973}
974
975/// An iterator yielding the offsets to the granules of a var-len object.
976pub struct GranuleOffsetIter<'vv, 'page> {
977    /// Our mutable view of the page.
978    var_view: &'vv mut VarView<'page>,
    /// The offset that will be yielded next, pointing to the next granule.
980    next_granule: PageOffset,
981}
982
983impl GranuleOffsetIter<'_, '_> {
984    /// Returns a mutable view of, for the `granule` at `offset`, `granule.data[start..]`.
985    ///
986    /// # Safety
987    ///
988    /// - `offset` must point to a valid granule
989    /// - `start < VarLenGranule::DATA_SIZE`
990    pub unsafe fn get_mut_data(&mut self, offset: PageOffset, start: usize) -> &mut Bytes {
        // SAFETY: Caller promised that `offset` points to a valid granule.
992        let granule: &mut VarLenGranule = unsafe { get_mut(self.var_view.var_row_data, offset) };
993        // SAFETY: Caller promised `start < granule.data.len()`.
994        unsafe { granule.data.as_mut_slice().get_unchecked_mut(start..) }
995    }
996}
997
998impl Iterator for GranuleOffsetIter<'_, '_> {
999    type Item = PageOffset;
1000    fn next(&mut self) -> Option<Self::Item> {
1001        let adjust = self.var_view.adjuster();
1002
1003        if self.next_granule.is_var_len_null() {
1004            return None;
1005        }
1006        let ret = adjust(self.next_granule);
1007        // SAFETY: By construction,
1008        // the initial `next_granule` was promised to either be `NULL` or point to a valid granule.
1009        // For a given granule, the same applies to its `.next()` granule.
1010        // At this point, we've excluded `NULL`,
1011        // so we know inductively that `next_granule` points to a valid granule, as required.
1012        let granule: &VarLenGranule = unsafe { get_ref(self.var_view.var_row_data, ret) };
1013        self.next_granule = granule.header.next();
1014
1015        Some(ret)
1016    }
1017}
1018
1019/// Assert that `ptr` is sufficiently aligned to reference a value of `T`.
1020///
1021/// In release mode, this is a no-op.
1022fn assert_alignment<T>(ptr: *const Byte) {
1023    debug_assert_eq!(
1024        ptr as usize % mem::align_of::<T>(),
1025        0,
1026        "Wanted a PageOffset with align 0x{:x} (for {}) but found 0x{:x}",
1027        mem::align_of::<T>(),
1028        std::any::type_name::<T>(),
1029        ptr as usize,
1030    );
1031}
1032
/// Returns a reference to the `T` pointed to by `offset`.
1034///
1035/// # Safety
1036///
1037/// `offset` must point to a valid `T` in `row_data`.
1038#[inline]
1039pub unsafe fn get_ref<T>(row_data: &Bytes, offset: PageOffset) -> &T {
1040    // SAFETY: Caller promised that `offset` is in bounds of `row_data`.
1041    let ptr = unsafe { offset_to_ptr(row_data, offset) };
1042    assert_alignment::<T>(ptr);
1043    let ptr = ptr.cast::<T>();
1044    // SAFETY: Caller promised that `offset` points to a `T` in `row_data`.
1045    unsafe { &*ptr }
1046}
1047
/// Returns a mutable reference to the `T` pointed to by `offset`.
1049///
1050/// # Safety
1051///
1052/// `offset` must point to a valid `T` in `row_data`.
1053#[inline]
1054unsafe fn get_mut<T>(row_data: &mut Bytes, offset: PageOffset) -> &mut T {
1055    // SAFETY: Caller promised that `offset` is in bounds of `row_data`.
1056    let ptr = unsafe { offset_to_ptr_mut(row_data, offset) };
1057    assert_alignment::<T>(ptr as *const Byte);
1058    let ptr = ptr.cast::<T>();
1059    // SAFETY: Caller promised that `offset` points to a `T` in `row_data`.
1060    unsafe { &mut *ptr }
1061}
1062
1063/// Returns a raw const pointer into the `row_data` at `offset` bytes.
1064///
1065/// # Safety
1066///
1067/// `offset` must be in bounds or one past end of `row_data`.
1068#[inline]
1069unsafe fn offset_to_ptr(row_data: &Bytes, offset: PageOffset) -> *const Byte {
1070    debug_assert!(offset.idx() <= row_data.len());
1071
1072    // SAFETY: per caller contract, `offset` is in bounds or one past end of `row_data`.
1073    unsafe { row_data.as_ptr().add(offset.idx()) }
1074}
1075
1076/// Returns a raw mutable pointer into the `row_data` at `offset` bytes.
1077///
1078/// SAFETY: `offset` must be in bounds or one past end of `row_data`.
1079#[inline]
1080unsafe fn offset_to_ptr_mut(row_data: &mut Bytes, offset: PageOffset) -> *mut Byte {
1081    debug_assert!(offset.idx() <= row_data.len());
1082
1083    // SAFETY: per caller contract, `offset` is in bounds or one past end of `row_data`.
1084    unsafe { row_data.as_mut_ptr().add(offset.idx()) }
1085}
1086
1087/// Returns the size of the gap,
1088/// assuming `first_var` is the high water mark (HWM) of the var-len section,
1089/// pointing *at* the granule with the lowest offset,
1090/// and `last_fixed` is the HWM of the fixed-len section,
1091/// pointing *one past the end* of the last fixed row.
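///
/// For example (assumed numbers): with `first_var = 960` and `last_fixed = 320`,
/// the gap is `960 - 320 = 640` bytes, i.e. room for `640 / 64 = 10` more granules.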
1092#[inline]
1093fn gap_remaining_size(first_var: PageOffset, last_fixed: PageOffset) -> Size {
1094    // For illustration, suppose `row_data` is 10 bytes, i.e., `[Byte; 10]`.
1095    // Let's assume the following setup with a full page,
1096    // where capital letters are fixed rows and lower case are variable.
1097    //
1098    // [ A, B, C, D, E, f, g, h, i, j ]
1099    //                  ^
1100    //               first_var
1101    //                  ^
1102    //               last_fixed
1103    //
    // The high water mark `first_var` points *at* the granule with the lowest offset (`f`),
    // whereas `last_fixed` points *one past the end* (`f`) of the last fixed row (`E`).
    //
    // A full page is the case we have to consider in terms of possible underflow:
    // as both HWMs then point at the same place,
    // the result is `0`, and no underflow occurs.
1110    Size((first_var - last_fixed).0)
1111}
1112
1113/// Returns whether the remaining gap is large enough to host an object `fixed_row_size` large,
1114/// assuming `first_var` is the high water mark (HWM) of the var-len section,
1115/// pointing *at* the granule with the lowest offset,
1116/// and `last_fixed` is the HWM of the fixed-len section,
1117/// pointing *one past the end* of the last fixed row.
1118#[inline]
1119fn gap_enough_size_for_row(first_var: PageOffset, last_fixed: PageOffset, fixed_row_size: Size) -> bool {
1120    gap_remaining_size(first_var, last_fixed) >= fixed_row_size
1121}
1122
1123impl Page {
1124    /// Returns a new page allocated on the heap.
1125    ///
    /// The new page supports rows whose fixed part is `fixed_row_size` bytes.
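    ///
    /// A minimal sketch (not a doctest; the row size is an assumed example):
    ///
    /// ```ignore
    /// let page: Box<Page> = Page::new(Size(16));
    /// assert_eq!(page.num_rows(), 0);
    /// ```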
1127    pub fn new(fixed_row_size: Size) -> Box<Self> {
1128        Self::new_with_max_row_count(max_rows_in_page(fixed_row_size))
1129    }
1130
1131    /// Returns a new page allocated on the heap.
1132    ///
    /// The new page supports at most `max_rows_in_page` rows.
1134    pub fn new_with_max_row_count(max_rows_in_page: usize) -> Box<Self> {
1135        // TODO(perf): mmap? allocator may do so already.
1136        // mmap may be more efficient as we save allocator metadata.
1137        use std::alloc::{alloc_zeroed, handle_alloc_error, Layout};
1138
1139        let layout = Layout::new::<Page>();
1140
1141        // Allocate with `alloc_zeroed` so that the bytes are initially 0, rather than uninit.
1142        // We will never write an uninit byte into the page except in the `PageHeader`,
1143        // so it is safe for `row_data` to have type `[u8; _]` rather than `[MaybeUninit<u8>; _]`.
1144        // `alloc_zeroed` may be more efficient than `alloc` + `memset`;
1145        // in particular, it may `mmap` pages directly from the OS, which are always zeroed for security reasons.
1146        // TODO: use Box::new_zeroed() once stabilized.
1147        // SAFETY: The layout's size is non-zero.
1148        let raw: *mut Page = unsafe { alloc_zeroed(layout) }.cast();
1149
1150        if raw.is_null() {
1151            handle_alloc_error(layout);
1152        }
1153
1154        // We need to initialize `Page::header`
1155        // without materializing a `&mut` as that is instant UB.
1156        // SAFETY: `raw` isn't NULL.
1157        let header = unsafe { &raw mut (*raw).header };
1158
1159        // SAFETY: `header` is valid for writes as only we have exclusive access.
1160        //          The pointer is also aligned.
1161        unsafe { header.write(PageHeader::new(max_rows_in_page)) };
1162
1163        // SAFETY: We used the global allocator with a layout for `Page`.
1164        //         We have initialized the `header`,
1165        //         and the `row_bytes` are initially 0 by `alloc_zeroed`,
1166        //         making the pointee a `Page` valid for reads and writes.
1167        unsafe { Box::from_raw(raw) }
1168    }
1169
1170    /// Returns the number of rows stored in this page.
1171    ///
1172    /// This method runs in constant time.
1173    pub fn num_rows(&self) -> usize {
1174        self.header.fixed.num_rows as usize
1175    }
1176
1177    #[cfg(test)]
1178    /// Use this page's present rows bitvec to compute the number of present rows.
1179    ///
1180    /// This can be compared with [`Self::num_rows`] as a consistency check during tests.
1181    pub fn reconstruct_num_rows(&self) -> usize {
1182        // If we cared, we could rewrite this to `u64::count_ones` on each block of the bitset.
1183        // We do not care. This method is slow.
1184        self.header.fixed.present_rows.iter_set().count()
1185    }
1186
1187    /// Returns the number of var-len granules allocated in this page.
1188    ///
1189    /// This method runs in constant time.
1190    pub fn num_var_len_granules(&self) -> usize {
1191        self.header.var.num_granules as usize
1192    }
1193
1194    #[cfg(test)]
1195    /// # Safety
1196    ///
1197    /// - `var_len_visitor` must be a valid [`VarLenMembers`] visitor
1198    ///   specialized to the type and layout of rows within this [`Page`].
1199    /// - `fixed_row_size` must be exactly the length in bytes of fixed rows in this page,
1200    ///   which must further be the length of rows expected by the `var_len_visitor`.
1201    pub unsafe fn reconstruct_num_var_len_granules(
1202        &self,
1203        fixed_row_size: Size,
1204        var_len_visitor: &impl VarLenMembers,
1205    ) -> usize {
1206        self.iter_fixed_len(fixed_row_size)
1207            .flat_map(|row| unsafe {
1208                // Safety: `row` came out of `iter_fixed_len`,
1209                // which, due to caller requirements on `fixed_row_size`,
1210                // is giving us valid, aligned, initialized rows of the row type.
1211                var_len_visitor.visit_var_len(self.get_row_data(row, fixed_row_size))
1212            })
1213            .flat_map(|var_len_obj| unsafe {
1214                // Safety: We believe `row` to be valid
1215                // and `var_len_visitor` to be correctly visiting its var-len members.
1216                // Therefore, `var_len_obj` is a valid var-len object.
1217                self.iter_var_len_object(var_len_obj.first_granule)
1218            })
1219            .count()
1220    }
1221
1222    /// Returns the number of bytes used by rows stored in this page.
1223    ///
1224    /// This is necessarily an overestimate of live data bytes, as it includes:
1225    /// - Padding bytes within the fixed-length portion of the rows.
1226    /// - [`VarLenRef`] pointer-like portions of rows.
1227    /// - Unused trailing parts of partially-filled [`VarLenGranule`]s.
1228    /// - [`VarLenGranule`]s used to store [`BlobHash`]es.
1229    ///
1230    /// Note that large blobs themselves are not counted.
1231    /// The caller should obtain a count of the bytes used by large blobs
1232    /// from the [`super::blob_store::BlobStore`].
1233    ///
1234    /// This method runs in constant time.
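    ///
    /// For example (assumed numbers): a page holding 10 rows of a 32-byte fixed-len type
    /// with 3 allocated granules reports `10 * 32 + 3 * 64 = 512` bytes.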
1235    pub fn bytes_used_by_rows(&self, fixed_row_size: Size) -> usize {
1236        let fixed_row_bytes = self.num_rows() * fixed_row_size.len();
1237        let var_len_bytes = self.num_var_len_granules() * VarLenGranule::SIZE.len();
1238        fixed_row_bytes + var_len_bytes
1239    }
1240
1241    #[cfg(test)]
1242    /// # Safety
1243    ///
1244    /// - `var_len_visitor` must be a valid [`VarLenMembers`] visitor
1245    ///   specialized to the type and layout of rows within this [`Page`].
1246    /// - `fixed_row_size` must be exactly the length in bytes of fixed rows in this page,
1247    ///   which must further be the length of rows expected by the `var_len_visitor`.
1248    pub unsafe fn reconstruct_bytes_used_by_rows(
1249        &self,
1250        fixed_row_size: Size,
1251        var_len_visitor: &impl VarLenMembers,
1252    ) -> usize {
1253        let fixed_row_bytes = self.reconstruct_num_rows() * fixed_row_size.len();
1254        let var_len_bytes = unsafe { self.reconstruct_num_var_len_granules(fixed_row_size, var_len_visitor) }
1255            * VarLenGranule::SIZE.len();
1256        fixed_row_bytes + var_len_bytes
1257    }
1258
1259    /// Returns the range of row data starting at `offset` and lasting `size` bytes.
1260    pub fn get_row_data(&self, row: PageOffset, size: Size) -> &Bytes {
1261        &self.row_data[row.range(size)]
1262    }
1263
1264    /// Returns whether the row at `offset` is present or not.
1265    pub fn has_row_offset(&self, fixed_row_size: Size, offset: PageOffset) -> bool {
1266        // Check that the `offset` is properly aligned for a row of size `fixed_row_size`.
1267        // This cannot be `debug_assert!` as the caller could rely on this
1268        // reporting properly whether `offset` is at a row boundary or not.
1269        assert_eq!(offset.idx() % fixed_row_size.len(), 0);
1270
1271        self.header.fixed.is_row_present(offset, fixed_row_size)
1272    }
1273
1274    /// Returns split mutable views of this page over the fixed and variable sections.
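    ///
    /// # Example
    ///
    /// A minimal sketch of how the two views cooperate, mirroring the pattern in
    /// [`Page::insert_row`]; `offset`, `fixed_row_size`, `fixed_row_bytes`, and `var_len_bytes`
    /// are placeholders (not compiled as a doctest):
    ///
    /// ```ignore
    /// let (mut fixed, mut var) = page.split_fixed_var_mut();
    /// // Write the fixed part of a freshly allocated row through the fixed view...
    /// let row = fixed.get_row_mut(offset, fixed_row_size);
    /// row.copy_from_slice(fixed_row_bytes);
    /// // ...while allocating var-len granules through the var view.
    /// let (var_len_ref, in_blob) = var.alloc_for_slice(var_len_bytes)?;
    /// ```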
1275    pub fn split_fixed_var_mut(&mut self) -> (FixedView<'_>, VarView<'_>) {
1276        // The fixed HWM (`fixed.last`) points *one past the end* of the fixed section
1277        // which is exactly what we want for `split_at_mut`.
1278        let last_fixed = self.header.fixed.last;
1279        let (fixed_row_data, var_row_data) = self.row_data.split_at_mut(last_fixed.idx());
1280
1281        // Construct the fixed-len view.
1282        let fixed = FixedView {
1283            fixed_row_data,
1284            header: &mut self.header.fixed,
1285        };
1286
1287        // Construct the var-len view.
1288        let var = VarView {
1289            var_row_data,
1290            header: &mut self.header.var,
1291            last_fixed,
1292        };
1293
1294        (fixed, var)
1295    }
1296
1297    /// Returns a mutable view of the row from `start` lasting `fixed_row_size` number of bytes.
1298    ///
1299    /// This method is safe, but callers should take care that `start` and `fixed_row_size`
1300    /// are correct for this page, and that `start` is aligned.
1301    /// Callers should further ensure that mutations to the row leave the row bytes
1302    /// in an expected state, i.e. initialized where required by the row type,
1303    /// and with `VarLenRef`s that point to valid granules and with correct lengths.
1304    ///
1305    /// This call will clear the unmodified hash
1306    /// as it is expected that the caller will alter the page.
1307    pub fn get_fixed_row_data_mut(&mut self, start: PageOffset, fixed_row_size: Size) -> &mut Bytes {
1308        self.header.unmodified_hash = None;
1309        &mut self.row_data[start.range(fixed_row_size)]
1310    }
1311
1312    /// Return the total required var-len granules to store `objects`.
1313    pub fn total_granules_required_for_objects(objects: &[impl AsRef<[u8]>]) -> usize {
1314        objects
1315            .iter()
1316            .map(|obj| VarLenGranule::bytes_to_granules(obj.as_ref().len()).0)
1317            .sum()
1318    }
1319
1320    /// Does the page have space to store a row,
1321    /// where the fixed size part is `fixed_row_size` bytes large,
1322    /// and the row has the given `var_len_objects`?
1323    pub fn has_space_for_row_with_objects(&self, fixed_row_size: Size, var_len_objects: &[impl AsRef<[u8]>]) -> bool {
1324        let num_granules_required = Self::total_granules_required_for_objects(var_len_objects);
1325        self.has_space_for_row(fixed_row_size, num_granules_required)
1326    }
1327
1328    /// Does the page have space to store a row,
1329    /// where the fixed size part is `fixed_row_size` bytes large,
1330    /// and the variable part requires `num_granules` granules?
1331    pub fn has_space_for_row(&self, fixed_row_size: Size, num_granules: usize) -> bool {
1332        // Determine the gap remaining after allocating for the fixed part.
1333        let gap_remaining = gap_remaining_size(self.header.var.first, self.header.fixed.last);
1334        let gap_avail_for_granules = if self.header.fixed.next_free.has() {
1335            // If we have a free fixed length block, then we can use the whole gap for var-len granules.
1336            gap_remaining
1337        } else {
1338            // If we need to grow the fixed-length store into the gap,
1339            if gap_remaining < fixed_row_size {
1340                // if the gap is too small for a fixed-length row, fail.
1341                return false;
1342            }
1343            // Otherwise, the space available in the gap for var-len granules
1344            // is the current gap size less the fixed-len row size.
1345            gap_remaining - fixed_row_size
1346        };
1347
1348        // Convert the gap size to granules.
1349        let gap_in_granules = VarLenGranule::space_to_granules(gap_avail_for_granules);
1350        // Account for granules available in the freelist.
1351        let needed_granules_after_freelist = num_granules.saturating_sub(self.header.var.freelist_len as usize);
1352
1353        gap_in_granules >= needed_granules_after_freelist
1354    }
1355
1356    /// Returns whether the page is full with respect to storing a fixed row with `fixed_row_size`
1357    /// and no variable component.
1358    pub fn is_full(&self, fixed_row_size: Size) -> bool {
1359        !self.has_space_for_row(fixed_row_size, 0)
1360    }
1361
1362    /// Will leave partially-allocated chunks if it fails prematurely,
1363    /// so always check `Self::has_space_for_row` before calling.
1364    ///
1365    /// This method is provided for testing the page store directly;
1366    /// higher-level codepaths are expected to use [`crate::bflatn::ser::write_av_to_page`],
1367    /// which performs similar operations to this method,
1368    /// but handles rollback on failure appropriately.
1369    ///
1370    /// This function will never fail if `Self::has_space_for_row` has returned true.
1371    ///
1372    /// # Safety
1373    ///
1374    /// - `var_len_visitor` is suitable for visiting var-len refs in `fixed_row`.
1375    ///
1376    /// - `fixed_row.len()` must be consistent with `var_len_visitor` and `self`.
1377    ///   That is, `VarLenMembers` must be specialized for a row type with that length,
1378    ///   and all past, present, and future fixed-length rows stored in this `Page`
1379    ///   must also be of that length.
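    ///
    /// # Example
    ///
    /// A sketch of the expected check-then-insert pattern; `str_var_len_visitor` and
    /// `NullBlobStore` are the helpers used by this module's tests (not compiled as a doctest):
    ///
    /// ```ignore
    /// let row_size = row_size_for_type::<VarLenRef>();
    /// let mut page = Page::new(row_size);
    /// let fixed = [0u8; mem::size_of::<VarLenRef>()];
    /// let var_len_objects = [b"hello".as_slice()];
    /// // Always check for space first; a failed `insert_row` may leave partial allocations.
    /// if page.has_space_for_row_with_objects(row_size, &var_len_objects) {
    ///     // SAFETY: the visitor and row size match the row type used for every call on `page`.
    ///     let offset = unsafe {
    ///         page.insert_row(&fixed, &var_len_objects, str_var_len_visitor(), &mut NullBlobStore)
    ///     }
    ///     .expect("has_space_for_row_with_objects said there was space");
    /// }
    /// ```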
1380    pub unsafe fn insert_row(
1381        &mut self,
1382        fixed_row: &Bytes,
1383        var_len_objects: &[impl AsRef<[u8]>],
1384        var_len_visitor: &impl VarLenMembers,
1385        blob_store: &mut dyn BlobStore,
1386    ) -> Result<PageOffset, Error> {
1387        // Allocate the fixed-len row.
1388        let fixed_row_size = Size(fixed_row.len() as u16);
1389
1390        // SAFETY: Caller promised that `fixed_row.len()` uses the right `fixed_row_size`
1391        // and we trust that others have too.
1392        let fixed_len_offset = unsafe { self.alloc_fixed_len(fixed_row_size)? };
1393
1394        // Store the fixed-len row.
1395        let (mut fixed, mut var) = self.split_fixed_var_mut();
1396        let row = fixed.get_row_mut(fixed_len_offset, fixed_row_size);
1397        row.copy_from_slice(fixed_row);
1398
1399        // Store all var-len refs into their appropriate slots in the fixed-len row.
1400        // SAFETY:
1401        // - The `fixed_len_offset` given by `alloc_fixed_len` results in `row`
1402        //   being properly aligned for the row type.
1403        // - Caller promised that `fixed_row.len()` matches the row type size exactly.
1404        // - `var_len_visitor` is suitable for `fixed_row`.
1405        let vlr_slot_iter = unsafe { var_len_visitor.visit_var_len_mut(row) };
1406        for (var_len_ref_slot, var_len_obj) in vlr_slot_iter.zip(var_len_objects) {
1407            let (var_len_ref, in_blob) = var.alloc_for_slice(var_len_obj.as_ref())?;
1408            if in_blob {
1409                // The blob store insertion will never fail.
1410                // SAFETY: `alloc_for_slice` always returns a pointer
1411                // to a `VarLenGranule` in bounds of this page.
1412                // As `in_blob` holds, it is also unused, as required.
1413                // We'll now make that granule valid.
1414                unsafe {
1415                    var.write_large_blob_hash_to_granule(blob_store, var_len_obj, var_len_ref);
1416                }
1417            }
1418            *var_len_ref_slot = var_len_ref;
1419        }
1420
1421        Ok(fixed_len_offset)
1422    }
1423
1424    /// Allocates space for a fixed size row of `fixed_row_size` bytes.
1425    ///
1426    /// # Safety
1427    ///
1428    /// `fixed_row_size` must be equal to the value passed
1429    /// to all other methods ever invoked on `self`.
1430    pub unsafe fn alloc_fixed_len(&mut self, fixed_row_size: Size) -> Result<PageOffset, Error> {
1431        self.alloc_fixed_len_from_freelist(fixed_row_size)
1432            .or_else(|| self.alloc_fixed_len_from_gap(fixed_row_size))
1433            .ok_or(Error::InsufficientFixedLenSpace { need: fixed_row_size })
1434    }
1435
1436    /// Allocates space for a fixed size row of `fixed_row_size` bytes from the freelist, if possible.
1437    ///
1438    /// On success, this call will clear the unmodified hash.
1439    #[inline]
1440    fn alloc_fixed_len_from_freelist(&mut self, fixed_row_size: Size) -> Option<PageOffset> {
1441        let header = &mut self.header.fixed;
1442        // SAFETY: `header.next_free` points to a `FreeCellRef` when the former `.has()`.
1443        let free = unsafe { header.next_free.take_freelist_head(&self.row_data, |x| x) }?;
1444        header.set_row_present(free, fixed_row_size);
1445
1446        // We have modified the page, so clear the unmodified hash.
1447        self.header.unmodified_hash = None;
1448
1449        Some(free)
1450    }
1451
1452    /// Allocates space for a fixed size row of `fixed_row_size` bytes from the gap, if possible.
1453    ///
1454    /// On success, this call will clear the unmodified hash.
1455    #[inline]
1456    fn alloc_fixed_len_from_gap(&mut self, fixed_row_size: Size) -> Option<PageOffset> {
1457        if gap_enough_size_for_row(self.header.var.first, self.header.fixed.last, fixed_row_size) {
1458            // We're modifying the page, so clear the unmodified hash.
1459            self.header.unmodified_hash = None;
1460
1461            // Enough space in the gap; move the high water mark and return the old HWM.
1462            // `fixed.last` points *after* the highest-indexed fixed-len row,
1463            // so post-increment.
1464            let ptr = self.header.fixed.last;
1465            self.header.fixed.last += fixed_row_size;
1466            self.header.fixed.set_row_present(ptr, fixed_row_size);
1467            Some(ptr)
1468        } else {
1469            // Not enough space in the gap for another row!
1470            None
1471        }
1472    }
1473
1474    /// Returns an iterator over all the [`PageOffset`]s of the fixed rows in this page
1475    /// beginning with `starting_from`.
1476    ///
1477    /// The rows are assumed to be `fixed_row_size` bytes long
1478    /// and `starting_from` is assumed to be at a valid starting `PageOffset` for a fixed row.
1479    ///
1480    /// NOTE: This method is not `unsafe` as it cannot trigger UB.
1481    /// However, when provided with garbage input, it will return garbage back.
1482    /// It is the caller's responsibility to ensure that `PageOffset`s derived from
1483    /// this iterator are valid when used to do anything `unsafe`.
1484    fn iter_fixed_len_from(&self, fixed_row_size: Size, starting_from: PageOffset) -> FixedLenRowsIter<'_> {
1485        let idx = starting_from / fixed_row_size;
1486        FixedLenRowsIter {
1487            idx_iter: self.header.fixed.present_rows.iter_set_from(idx),
1488            fixed_row_size,
1489        }
1490    }
1491
1492    /// Returns an iterator over all the [`PageOffset`]s of the fixed rows in this page.
1493    ///
1494    /// The rows are assumed to be `fixed_row_size` bytes long.
1495    ///
1496    /// NOTE: This method is not `unsafe` as it cannot trigger UB.
1497    /// However, when provided with garbage input, it will return garbage back.
1498    /// It is the caller's responsibility to ensure that `PageOffset`s derived from
1499    /// this iterator are valid when used to do anything `unsafe`.
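    ///
    /// # Example
    ///
    /// An illustrative sketch of reading back `u64` rows, as this module's tests do
    /// (not compiled as a doctest):
    ///
    /// ```ignore
    /// for offset in page.iter_fixed_len(u64_row_size()) {
    ///     let bytes = page.get_row_data(offset, u64_row_size());
    ///     let value = u64::from_le_bytes(bytes.try_into().unwrap());
    ///     // ... use `value` ...
    /// }
    /// ```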
1500    pub fn iter_fixed_len(&self, fixed_row_size: Size) -> FixedLenRowsIter<'_> {
1501        FixedLenRowsIter {
1502            idx_iter: self.header.fixed.present_rows.iter_set(),
1503            fixed_row_size,
1504        }
1505    }
1506
1507    /// Returns an iterator over all the `VarLenGranule`s of the var-len object
1508    /// that has its first granule at offset `first_granule`.
1509    /// An empty iterator will be returned when `first_granule` is `NULL`.
1510    ///
1511    /// # Safety
1512    ///
1513    /// `first_granule` must be an offset to a valid granule or `NULL`.
1514    pub unsafe fn iter_var_len_object(
1515        &self,
1516        first_granule: PageOffset,
1517    ) -> impl Clone + Iterator<Item = &VarLenGranule> {
1518        VarLenGranulesIter {
1519            page: self,
1520            next_granule: first_granule,
1521        }
1522    }
1523
1524    /// Returns an iterator over the data of all the `VarLenGranule`s of the var-len object
1525    /// that has its first granule at offset `first_granule`.
1526    /// An empty iterator will be returned when `first_granule` is `NULL`.
1527    ///
1528    /// # Safety
1529    ///
1530    /// `first_granule` must be an offset to a valid granule or `NULL`.
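    ///
    /// # Example
    ///
    /// A sketch of reassembling a var-len object from its granule chain,
    /// where `vlr` is a `VarLenRef` read out of a live fixed row in this page
    /// (not compiled as a doctest):
    ///
    /// ```ignore
    /// // SAFETY: `vlr` came from a live row, so `vlr.first_granule` is valid or NULL.
    /// let bytes: Vec<u8> = unsafe { page.iter_vlo_data(vlr.first_granule) }
    ///     .flat_map(|chunk| chunk.iter().copied())
    ///     .collect();
    /// // Note: for objects stored in the blob store, the single granule holds a `BlobHash`
    /// // rather than the object's data.
    /// ```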
1531    pub unsafe fn iter_vlo_data(&self, first_granule: PageOffset) -> impl '_ + Clone + Iterator<Item = &[u8]> {
1532        // SAFETY: Caller and callee have the exact same safety requirements.
1533        unsafe { self.iter_var_len_object(first_granule) }.map(|g| g.data())
1534    }
1535
1536    /// Free a row, marking its fixed-len and var-len storage granules as available for re-use.
1537    ///
1538    /// This call will clear the unmodified hash.
1539    ///
1540    /// # Safety
1541    ///
1542    /// - `fixed_row` must point to a valid row in this page.
1543    ///
1544    /// - `fixed_row_size` must be the size in bytes of the fixed part
1545    ///   of all past, present, and future rows in this page.
1546    ///
1547    /// - The `var_len_visitor` must visit the same set of `VarLenRef`s in the row
1548    ///   as the visitor provided to `insert_row`.
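    ///
    /// # Example
    ///
    /// A sketch of the insert/delete round-trip used throughout this module's tests;
    /// `offset` is assumed to have come from a prior `insert_row` on the same page
    /// (not compiled as a doctest):
    ///
    /// ```ignore
    /// // SAFETY: `offset` points to a live row, and the visitor and row size
    /// // are the ones used for every other call on `page`.
    /// unsafe { page.delete_row(offset, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
    /// // The fixed-len slot and any var-len granules are now on the freelists for re-use.
    /// ```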
1549    pub unsafe fn delete_row(
1550        &mut self,
1551        fixed_row: PageOffset,
1552        fixed_row_size: Size,
1553        var_len_visitor: &impl VarLenMembers,
1554        blob_store: &mut dyn BlobStore,
1555    ) -> BlobNumBytes {
1556        // We're modifying the page, so clear the unmodified hash.
1557        self.header.unmodified_hash = None;
1558
1559        let (mut fixed, mut var) = self.split_fixed_var_mut();
1560
1561        let mut blob_store_deleted_bytes = BlobNumBytes::default();
1562
1563        // Visit the var-len members of the fixed row and free them.
1564        let row = fixed.get_row(fixed_row, fixed_row_size);
1565        // SAFETY: `row` is derived from `fixed_row`, which is known by caller requirements to be valid.
1566        let var_len_refs = unsafe { var_len_visitor.visit_var_len(row) };
1567        for var_len_ref in var_len_refs {
1568            // SAFETY: A sound call to `visit_var_len` on a fully initialized valid row,
1569            // which we've justified that the above is,
1570            // returns an iterator, that will only yield `var_len_ref`s,
1571            // where `var_len_ref.first_granule` points to a valid `VarLenGranule` or is NULL.
1572            blob_store_deleted_bytes += unsafe { var.free_object(*var_len_ref, blob_store) }
1573        }
1574
1575        // SAFETY: Caller promised that `fixed_row` points to a valid row in the page.
1576        // Thus, `range_move(0..fixed_row_size, fixed_row)` is in bounds of `row_data`.
1577        // Moreover, this entails that it is valid for writing a `FreeCellRef`
1578        // to the beginning of the range, as any row can at least hold a `FreeCellRef`
1579        // and will be properly aligned for it as well.
1580        unsafe {
1581            fixed.free(fixed_row, fixed_row_size);
1582        }
1583
1584        blob_store_deleted_bytes
1585    }
1586
1587    /// Returns the total number of granules used by the fixed row at `fixed_row_offset`
1588    /// and lasting `fixed_row_size` bytes where `var_len_visitor` is used to find
1589    /// the [`VarLenRef`]s in the fixed row.
1590    ///
1591    /// # Safety
1592    ///
1593    /// - `fixed_row_offset` must refer to a previously-allocated and initialized row in `self`,
1594    ///   and must not have been de-allocated. In other words, the fixed row must be *valid*.
1595    ///
1596    /// - `fixed_row_size` and `var_len_visitor` must be consistent with each other
1597    ///   and with all other calls to any methods on `self`.
1598    pub unsafe fn row_total_granules(
1599        &self,
1600        fixed_row_offset: PageOffset,
1601        fixed_row_size: Size,
1602        var_len_visitor: &impl VarLenMembers,
1603    ) -> usize {
1604        let fixed_row = self.get_row_data(fixed_row_offset, fixed_row_size);
1605        // SAFETY:
1606        // - Caller promised that `fixed_row_offset` is a valid row.
1607        // - Caller promised consistency of `var_len_visitor` wrt. `fixed_row_size` and this page.
1608        let vlr_iter = unsafe { var_len_visitor.visit_var_len(fixed_row) };
1609        vlr_iter.copied().map(|slot| slot.granules_used()).sum()
1610    }
1611
1612    /// Copy as many rows from `self` for which `filter` returns `true` into `dst` as will fit,
1613    /// starting from `starting_from`.
1614    ///
1615    /// If less than the entirety of `self` could be processed, return `Continue(resume_point)`,
1616    /// where `resume_point` is the `starting_from` argument of a subsequent call to `copy_filter_into`
1617    /// that will complete the iteration.
1618    /// `dst` should be assumed to be full in this case,
1619    /// as it does not contain enough free space to store the row of `self` at `resume_point`.
1620    ///
1621    /// If the entirety of `self` is processed, return `Break`.
1622    /// `dst` may or may not be full in this case, but is likely not full.
1623    ///
1624    /// # Safety
1625    ///
1626    /// The `var_len_visitor` must visit the same set of `VarLenRef`s in the row
1627    /// as the visitor provided to all other methods on `self` and `dst`.
1628    ///
1629    /// The `fixed_row_size` must be consistent with the `var_len_visitor`,
1630    /// and be equal to the value provided to all other methods on `self` and `dst`.
1631    ///
1632    /// The `starting_from` offset must point to a valid starting offset
1633    /// consistent with `fixed_row_size`.
1634    /// That is, it must not point into the middle of a row.
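    ///
    /// # Example
    ///
    /// A sketch of the resumable copy protocol; `new_empty_page` and `keep` are stand-ins
    /// for however the caller obtains fresh destination pages and decides which rows to retain
    /// (not compiled as a doctest):
    ///
    /// ```ignore
    /// let mut dst = new_empty_page();
    /// let mut resume_point = PageOffset(0);
    /// loop {
    ///     // SAFETY: the visitor, row size, and `resume_point` are consistent with `src` and `dst`.
    ///     match unsafe {
    ///         src.copy_filter_into(resume_point, &mut dst, fixed_row_size, visitor, blob_store, keep)
    ///     } {
    ///         // `dst` is full: swap in a fresh page (a real caller would keep the filled one)
    ///         // and resume from the uncopied row.
    ///         ControlFlow::Continue(next) => {
    ///             resume_point = next;
    ///             dst = new_empty_page();
    ///         }
    ///         // All of `src` has been processed.
    ///         ControlFlow::Break(()) => break,
    ///     }
    /// }
    /// ```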
1635    pub unsafe fn copy_filter_into(
1636        &self,
1637        starting_from: PageOffset,
1638        dst: &mut Page,
1639        fixed_row_size: Size,
1640        var_len_visitor: &impl VarLenMembers,
1641        blob_store: &mut dyn BlobStore,
1642        mut filter: impl FnMut(&Page, PageOffset) -> bool,
1643    ) -> ControlFlow<(), PageOffset> {
1644        for row_offset in self
1645            .iter_fixed_len_from(fixed_row_size, starting_from)
1646            // Only copy rows satisfying the predicate `filter`.
1647            .filter(|o| filter(self, *o))
1648        {
1649            // SAFETY:
1650            // - `starting_from` points to a valid row and thus `row_offset` also does.
1651            // - `var_len_visitor` will visit the right `VarLenRef`s and is consistent with other calls.
1652            // - `fixed_row_size` is consistent with `var_len_visitor` and `self`.
1653            if !unsafe { self.copy_row_into(row_offset, dst, fixed_row_size, var_len_visitor, blob_store) } {
1654                // Target doesn't have enough space for row;
1655                // stop here and return the offset of the uncopied row
1656                // so a later call to `copy_filter_into` can start there.
1657                return ControlFlow::Continue(row_offset);
1658            }
1659        }
1660
1661        // The `for` loop completed.
1662        // We successfully copied the entire page of `self` into `dst`.
1663        // The caller doesn't need to resume from this offset.
1664        ControlFlow::Break(())
1665    }
1666
1667    /// Copies the row at `row_offset` from `self` into `dst`
1668    /// or returns `false` otherwise if `dst` has no space for the row.
1669    ///
1670    /// # Safety
1671    ///
1672    /// - `row_offset` offset must point to a valid row.
1673    ///
1674    /// - `var_len_visitor` must visit the same set of `VarLenRef`s in the row
1675    ///   as the visitor provided to all other methods on `self` and `dst`.
1676    ///
1677    /// - `fixed_row_size` must be consistent with the `var_len_visitor`,
1678    ///   and be equal to the value provided to all other methods on `self` and `dst`.
1679    unsafe fn copy_row_into(
1680        &self,
1681        row_offset: PageOffset,
1682        dst: &mut Page,
1683        fixed_row_size: Size,
1684        var_len_visitor: &impl VarLenMembers,
1685        blob_store: &mut dyn BlobStore,
1686    ) -> bool {
1687        // SAFETY: Caller promised that `row_offset` points to a valid row
1688        // consistent with `fixed_row_size`, which was also
1689        // claimed to be consistent with `var_len_visitor` and `self`.
1690        let required_granules = unsafe { self.row_total_granules(row_offset, fixed_row_size, var_len_visitor) };
1691        if !dst.has_space_for_row(fixed_row_size, required_granules) {
1692            // Target doesn't have enough space for row.
1693            return false;
1694        };
1695
1696        let src_row = self.get_row_data(row_offset, fixed_row_size);
1697
1698        // Allocate for the fixed-len data.
1699        // SAFETY: forward our requirement on `fixed_row_size` to `alloc_fixed_len`.
1700        let inserted_offset = unsafe { dst.alloc_fixed_len(fixed_row_size) }
1701            .expect("Failed to allocate fixed-len row in dst page after checking for available space");
1702
1703        // Copy all fixed-len data. We'll overwrite the var-len parts next.
1704        let (mut dst_fixed, mut dst_var) = dst.split_fixed_var_mut();
1705        let dst_row = dst_fixed.get_row_mut(inserted_offset, fixed_row_size);
1706        dst_row.copy_from_slice(src_row);
1707
1708        // Copy var-len members into target.
1709        // Fixup `VarLenRef`s in `dst_row` to point to the copied var-len objects.
1710        //
1711        // SAFETY: `src_row` is valid because it came from `self.iter_fixed_len_from`.
1712        //
1713        //         Forward our safety requirements re: `var_len_visitor` to `visit_var_len`.
1714        let src_vlr_iter = unsafe { var_len_visitor.visit_var_len(src_row) };
1715        // SAFETY: forward our requirement on `var_len_visitor` to `visit_var_len_mut`.
1716        let target_vlr_iter = unsafe { var_len_visitor.visit_var_len_mut(dst_row) };
1717        for (src_vlr, target_vlr_slot) in src_vlr_iter.zip(target_vlr_iter) {
1718            // SAFETY:
1719            //
1720            // - requirements of `visit_var_len` were met,
1721            //   so we can assume that `src_vlr.first_granule` points to a valid granule or is NULL.
1722            //
1723            // - the call to `dst.has_space_for_row` above ensures
1724            //   that the allocation will not fail part-way through.
1725            let target_vlr_fixup = unsafe { self.copy_var_len_into(*src_vlr, &mut dst_var, blob_store) }
1726                .expect("Failed to allocate var-len object in dst page after checking for available space");
1727
1728            *target_vlr_slot = target_vlr_fixup;
1729        }
1730
1731        true
1732    }
1733
1734    /// Copy a var-len object `src_vlr` from `self` into `dst_var`,
1735    /// and return the `VarLenRef` to the copy in `dst_var`.
1736    ///
1737    /// If the `src_vlr` is empty,
1738    /// i.e., has `first_granule.is_null()` and `length_in_bytes == 0`,
1739    /// this will return `VarLenRef::NULL`.
1740    ///
1741    /// # Safety
1742    ///
1743    /// - `src_vlr.first_granule` must point to a valid granule or be NULL.
1744    ///
1745    /// - To avoid leaving dangling uninitialized allocations in `dst_var`,
1746    ///   `dst_var` must already be checked to have enough size to store `src_vlr`
1747    ///   using `Self::has_space_for_row`.
1748    unsafe fn copy_var_len_into(
1749        &self,
1750        src_vlr: VarLenRef,
1751        dst_var: &mut VarView<'_>,
1752        blob_store: &mut dyn BlobStore,
1753    ) -> Result<VarLenRef, Error> {
1754        // SAFETY: Caller promised that `src_vlr.first_granule` points to a valid granule or is NULL.
1755        let mut iter = unsafe { self.iter_var_len_object(src_vlr.first_granule) };
1756
1757        // If the `src_vlr` is empty, don't copy anything, and return null.
1758        let Some(mut src_chunk) = iter.next() else {
1759            debug_assert!(src_vlr.length_in_bytes == 0);
1760            return Ok(VarLenRef::NULL);
1761        };
1762        let mut dst_chunk = dst_var.alloc_granule()?;
1763
1764        let copied_head = dst_chunk;
1765
1766        // Loop with one granule of lookahead so each granule's next-pointer can be written into `dst_chunk`.
1767        for next_src_chunk in iter {
1768            // Allocate space for the next granule so we can initialize it in the next iteration.
1769            let next_dst_chunk = dst_var.alloc_granule()?;
1770            let data = src_chunk.data();
1771            // Initialize `dst_chunk` with data and next-pointer.
1772            //
1773            // SAFETY:
1774            // 1. `dst_chunk` is properly aligned as it came from `alloc_granule` either
1775            //    before the loop or in the previous iteration.
1776            //    This also ensures that both granules are in bounds of the page,
1777            //    i.e., each range `granule..granule + VarLenGranule::SIZE` lies within `row_data`.
1778            //
1779            // 2. `next_dst_chunk` will be initialized
1780            //    either in the next iteration or after the loop ends.
1781            //
1782            // 3. `dst_chunk` points to unused data as the space was allocated before the loop
1783            //    or was `next_dst_chunk` in the previous iteration and hasn't been written to yet.
1784            unsafe { dst_var.write_chunk_to_granule(data, data.len(), dst_chunk, next_dst_chunk) };
1785            dst_chunk = next_dst_chunk;
1786            src_chunk = next_src_chunk;
1787        }
1788
1789        let data = src_chunk.data();
1790        // The last granule has null as next-pointer.
1791        //
1792        // SAFETY:
1793        // 1. `dst_chunk` is properly aligned as it came from `alloc_granule` either
1794        //    before the loop or in the previous iteration.
1795        //    This also ensures that both granules are in bounds of the page,
1796        //    i.e., each range `granule..granule + VarLenGranule::SIZE` lies within `row_data`.
1797        //
1798        // 2. `next` is NULL which is trivially init.
1799        //
1800        // 3. `dst_chunk` points to unused data as the space was allocated before the loop
1801        //    or was `next_dst_chunk` in the previous iteration and hasn't been written to yet.
1802        unsafe { dst_var.write_chunk_to_granule(data, data.len(), dst_chunk, PageOffset::VAR_LEN_NULL) };
1803
1804        // For a large blob object,
1805        // notify the `blob_store` that we've taken a reference to the blob hash.
1806        if src_vlr.is_large_blob() {
1807            blob_store
1808                .clone_blob(&src_chunk.blob_hash())
1809                .expect("blob_store could not mark hash as used");
1810        }
1811
1812        Ok(VarLenRef {
1813            first_granule: copied_head,
1814            length_in_bytes: src_vlr.length_in_bytes,
1815        })
1816    }
1817
1818    /// Make `self` empty, removing all rows from it and resetting the high water marks to zero.
1819    ///
1820    /// This also clears the `unmodified_hash`.
1821    pub fn clear(&mut self) {
1822        self.header.clear();
1823    }
1824
1825    /// Zeroes every byte of row data in this page.
1826    ///
1827    /// # Safety
1828    ///
1829    /// Causes the page header to no longer match the contents, invalidating many assumptions.
1830    /// Should be called in conjunction with [`Self::clear`].
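    ///
    /// # Example
    ///
    /// A minimal sketch of fully wiping a page (not compiled as a doctest):
    ///
    /// ```ignore
    /// // Reset the header first, then zero the row bytes it no longer describes.
    /// page.clear();
    /// // SAFETY: the header was just cleared, so it makes no claims about `row_data`.
    /// unsafe { page.zero_data() };
    /// ```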
1831    pub unsafe fn zero_data(&mut self) {
1832        self.row_data.fill(0);
1833    }
1834
1835    /// Resets this page for reuse of its allocation.
1836    ///
1837    /// The reset page supports at most `max_rows_in_page` rows.
1838    pub fn reset_for(&mut self, max_rows_in_page: usize) {
1839        self.header.reset_for(max_rows_in_page);
1840
1841        // NOTE(centril): We previously zeroed pages when resetting.
1842        // This had an adverse performance impact.
1843        // The reason why we previously zeroed was for security under a multi-tenant setup
1844        // when exposing a module ABI that allows modules to memcpy whole pages over.
1845        // However, we have no such ABI for the time being, so we can soundly avoid zeroing.
1846        // If we ever decide to add such an ABI, we must start zeroing again.
1847        //
1848        // // SAFETY: We just reset the page header.
1849        // unsafe { self.zero_data() };
1850    }
1851
1852    /// Sets the header and the row data.
1853    ///
1854    /// # Safety
1855    ///
1856    /// The `header` and `row_data` must be consistent with each other.
1857    pub(super) unsafe fn set_raw(&mut self, header: PageHeader, row_data: RowData) {
1858        self.header = header;
1859        self.row_data = row_data;
1860    }
1861
1862    /// Returns the page header, for testing.
1863    #[cfg(test)]
1864    pub(super) fn page_header_for_test(&self) -> &PageHeader {
1865        &self.header
1866    }
1867
1868    /// Computes the content hash of this page.
1869    pub fn content_hash(&self) -> blake3::Hash {
1870        let mut hasher = blake3::Hasher::new();
1871
1872        // Hash the page contents.
1873        hasher.update(&self.row_data);
1874
1875        // Hash the `FixedHeader`: first, copy its fixed-size fields, except for the bitset, into an array.
1876        let fixed = &self.header.fixed;
1877        let mut fixed_bytes = [0u8; 6];
1878        fixed_bytes[0..2].copy_from_slice(&fixed.next_free.next.0.to_le_bytes());
1879        fixed_bytes[2..4].copy_from_slice(&fixed.last.0.to_le_bytes());
1880        fixed_bytes[4..6].copy_from_slice(&fixed.num_rows.to_le_bytes());
1881        hasher.update(&fixed_bytes);
1882
1883        // Hash the fixed bit set.
1884        hasher.update(bytemuck::must_cast_slice(fixed.present_rows.storage()));
1885
1886        // Hash the `VarHeader`.
1887        hasher.update(bytemuck::bytes_of(&self.header.var));
1888
1889        // We're done.
1890        // Note that `unmodified_hash` itself must not be hashed to avoid a recursive dependency.
1891        hasher.finalize()
1892    }
1893
1894    /// Computes the content hash of this page and saves it to [`PageHeader::unmodified_hash`].
1895    pub fn save_content_hash(&mut self) {
1896        let hash = self.content_hash();
1897        self.header.unmodified_hash = Some(hash);
1898    }
1899
1900    /// Return the page's content hash, computing and saving it if it is not already stored.
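    ///
    /// # Example
    ///
    /// A sketch of the caching behaviour (not compiled as a doctest):
    ///
    /// ```ignore
    /// assert!(page.unmodified_hash().is_none());
    /// let hash = page.save_or_get_content_hash(); // computes and caches the hash
    /// assert_eq!(page.unmodified_hash(), Some(&hash));
    /// // Any mutation (insert, delete, `get_fixed_row_data_mut`, ...) clears the cached hash,
    /// // so the next call will recompute it.
    /// ```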
1901    pub fn save_or_get_content_hash(&mut self) -> blake3::Hash {
1902        self.unmodified_hash().copied().unwrap_or_else(|| {
1903            self.save_content_hash();
1904            self.header.unmodified_hash.unwrap()
1905        })
1906    }
1907
1908    /// Returns the stored unmodified hash, if any.
1909    pub fn unmodified_hash(&self) -> Option<&blake3::Hash> {
1910        self.header.unmodified_hash.as_ref()
1911    }
1912}
1913
1914/// An iterator over the `PageOffset`s of all present fixed-length rows in a [`Page`].
1915pub struct FixedLenRowsIter<'page> {
1916    /// Iterator over the set bits of the page's `present_rows` bitvec,
1917    /// i.e., the indices of the fixed-len slots
1918    /// that currently hold a present row.
1919    idx_iter: IterSet<'page>,
1920    /// The size of a row in bytes.
1921    fixed_row_size: Size,
1922}
1923
1924impl Iterator for FixedLenRowsIter<'_> {
1925    type Item = PageOffset;
1926
1927    fn next(&mut self) -> Option<Self::Item> {
1928        self.idx_iter
1929            .next()
1930            .map(|idx| PageOffset(idx as u16 * self.fixed_row_size.0))
1931    }
1932}
1933
1934/// An iterator over the [`VarLenGranule`]s in a particular [`VarLenRef`] in `page`.
1935///
1936/// Constructing a `VarLenGranulesIter` should be considered unsafe
1937/// because the initial `next_granule` must either be `NULL` or point to a valid [`VarLenGranule`].
1938///
1939/// Iterating over [`VarLenRef::NULL`] is safe and will immediately return `None`.
1940#[derive(Clone, Copy)]
1941struct VarLenGranulesIter<'page> {
1942    /// The page to yield granules from.
1943    page: &'page Page,
1944    /// Location of the next granule in `page`.
1945    /// Must either be `NULL` or point to a valid granule.
1946    next_granule: PageOffset,
1947    // TODO(perf,bikeshedding): store length and implement `Iterator::size_hint`?
1948}
1949
1950impl<'page> Iterator for VarLenGranulesIter<'page> {
1951    type Item = &'page VarLenGranule;
1952
1953    fn next(&mut self) -> Option<Self::Item> {
1954        if self.next_granule.is_var_len_null() {
1955            return None;
1956        }
1957
1958        // SAFETY: By construction,
1959        // the initial `next_granule` was promised to either be `NULL` or point to a valid granule.
1960        // For a given granule, the same applies to its `.next()` granule.
1961        // At this point, we've excluded `NULL`,
1962        // so we know inductively that `next_granule` points to a valid granule, as required.
1963        let granule: &VarLenGranule = unsafe { get_ref(&self.page.row_data, self.next_granule) };
1964        self.next_granule = granule.header.next();
1965
1966        Some(granule)
1967    }
1968}
1969
1970#[cfg(test)]
1971pub(crate) mod tests {
1972    use super::*;
1973    use crate::{
1974        blob_store::NullBlobStore, layout::row_size_for_type, page_pool::PagePool, var_len::AlignedVarLenOffsets,
1975    };
1976    use proptest::{collection::vec, prelude::*};
1977    use spacetimedb_lib::bsatn;
1978
1979    fn u64_row_size() -> Size {
1980        let fixed_row_size = row_size_for_type::<u64>();
1981        assert_eq!(fixed_row_size.len(), 8);
1982        fixed_row_size
1983    }
1984
1985    const U64_VL_VISITOR: AlignedVarLenOffsets<'_> = AlignedVarLenOffsets::from_offsets(&[]);
1986    fn u64_var_len_visitor() -> &'static AlignedVarLenOffsets<'static> {
1987        &U64_VL_VISITOR
1988    }
1989
1990    fn insert_u64(page: &mut Page, val: u64) -> PageOffset {
1991        let val_slice = val.to_le_bytes();
1992        unsafe { page.insert_row(&val_slice, &[] as &[&[u8]], u64_var_len_visitor(), &mut NullBlobStore) }
1993            .expect("Failed to insert first row")
1994    }
1995
1996    fn insert_u64_drop(page: &mut Page, val: u64) {
1997        insert_u64(page, val);
1998    }
1999
2000    fn read_u64(page: &Page, offset: PageOffset) -> u64 {
2001        let row = page.get_row_data(offset, u64_row_size());
2002        u64::from_le_bytes(row.try_into().unwrap())
2003    }
2004
2005    fn data_sub_n_vlg(n: usize) -> usize {
2006        PageOffset::PAGE_END.idx() - (VarLenGranule::SIZE * n).len()
2007    }
2008
2009    pub(crate) fn hash_unmodified_save_get(page: &mut Page) -> blake3::Hash {
2010        assert_eq!(page.header.unmodified_hash, None);
2011        page.save_content_hash();
2012        page.header.unmodified_hash.unwrap()
2013    }
2014
2015    #[test]
2016    fn insert_one_u64() {
2017        let mut page = Page::new(u64_row_size());
2018
2019        // First the hash is not saved, so compute it.
2020        let hash = hash_unmodified_save_get(&mut page);
2021
2022        let val: u64 = 0xa5a5_a5a5_a5a5_a5a5;
2023
2024        let offset = insert_u64(&mut page, val);
2025
2026        assert_eq!(offset.idx(), 0);
2027
2028        let row_val = read_u64(&page, offset);
2029
2030        assert_eq!(row_val, val);
2031
2032        // The hash should have been cleared.
2033        assert_ne!(hash, hash_unmodified_save_get(&mut page));
2034    }
2035
2036    fn insert_while(
2037        page: &mut Page,
2038        mut next_val: u64,
2039        fixed_row_size: Size,
2040        vl_num: usize,
2041        mut insert: impl FnMut(&mut Page, u64),
2042    ) -> u64 {
2043        while page.has_space_for_row(fixed_row_size, vl_num) {
2044            insert(page, next_val);
2045            next_val += 1;
2046        }
2047        next_val
2048    }
2049
2050    #[test]
2051    fn fill_then_iter_fixed_len_u64() {
2052        let mut page = Page::new(u64_row_size());
2053
2054        let last_val = insert_while(&mut page, 0, u64_row_size(), 0, insert_u64_drop);
2055        assert_eq!(last_val, (PageOffset::PAGE_END / u64_row_size()) as u64);
2056
2057        for (row_idx, expected_val) in page.iter_fixed_len(u64_row_size()).zip(0..last_val) {
2058            let row_val = read_u64(&page, row_idx);
2059            assert_eq!(
2060                row_val, expected_val,
2061                "row_val {:x} /= expected_val {:x}",
2062                row_val, expected_val
2063            );
2064        }
2065    }
2066
2067    #[test]
2068    fn fill_delete_iter_fixed_len_u64() {
2069        let mut page = Page::new(u64_row_size());
2070
2071        // First the hash is not saved, so compute it.
2072        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2073
2074        // Insert rows.
2075        let mut odds: Vec<PageOffset> = Vec::new();
2076        let last_val = insert_while(&mut page, 2, u64_row_size(), 0, |page, val| {
2077            let offset = insert_u64(page, val);
2078            if val % 2 == 1 {
2079                odds.push(offset);
2080            }
2081        });
2082
2083        // The hash should have been cleared.
2084        let hash_pre_del = hash_unmodified_save_get(&mut page);
2085        assert_ne!(hash_pre_ins, hash_pre_del);
2086
2087        // Delete rows.
2088        for row_offset in odds {
2089            unsafe { page.delete_row(row_offset, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
2090        }
2091
2092        // The hash should have been cleared.
2093        let hash_pre_iter = hash_unmodified_save_get(&mut page);
2094        assert_ne!(hash_pre_ins, hash_pre_iter);
2095        assert_ne!(hash_pre_del, hash_pre_iter);
2096
2097        // Iterate the rows.
2098        for (row_offset, expected_val) in page.iter_fixed_len(u64_row_size()).zip((2..last_val).step_by(2)) {
2099            let found_val = read_u64(&page, row_offset);
2100            assert_eq!(found_val, expected_val);
2101        }
2102
2103        // Hash is unchanged.
2104        assert_eq!(page.header.unmodified_hash, Some(hash_pre_iter));
2105    }
2106
2107    #[test]
2108    /// After deleting a fixed-length row and then inserting a new fixed-length row,
2109    /// the fixed-length high water mark must not change,
2110    /// i.e. we must re-use memory from the deleted row to store the new insertion.
2111    fn reuse_fixed_len_space() {
2112        let mut page = Page::new(u64_row_size());
2113
2114        // First the hash is not saved, so compute it.
2115        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2116
2117        // Insert two rows.
2118        let offset_0 = insert_u64(&mut page, 0xa5a5a5a5_a5a5a5a5);
2119        assert_eq!(offset_0.idx(), 0);
2120        let offset_1 = insert_u64(&mut page, 0xbeefbeef_beefbeef);
2121        assert_eq!(offset_1, u64_row_size());
2122
2123        assert_eq!(page.header.fixed.last, u64_row_size() * 2);
2124
2125        // Hash has been cleared after inserts.
2126        let hash_pre_del = hash_unmodified_save_get(&mut page);
2127        assert_ne!(hash_pre_ins, hash_pre_del);
2128
2129        // Delete first row.
2130        unsafe { page.delete_row(offset_0, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
2131
2132        assert_eq!(page.header.fixed.last, u64_row_size() * 2);
2133
2134        // Hash has been cleared after deletes.
2135        let hash_pre_ins2 = hash_unmodified_save_get(&mut page);
2136        assert_ne!(hash_pre_ins, hash_pre_ins2);
2137        assert_ne!(hash_pre_del, hash_pre_ins2);
2138
2139        // Insert first row again, re-using memory.
2140        let offset_0_again = insert_u64(&mut page, 0xffffffff_ffffffff);
2141
2142        assert_eq!(offset_0_again.idx(), 0);
2143        assert_eq!(offset_0.idx(), offset_0_again.idx());
2144
2145        assert_eq!(page.header.fixed.last, u64_row_size() * 2);
2146
2147        // Hash has been cleared after last insert, despite re-using memory.
2148        let hash_post_ins2 = hash_unmodified_save_get(&mut page);
2149        assert_ne!(hash_pre_ins, hash_post_ins2);
2150        assert_ne!(hash_pre_del, hash_post_ins2);
2151        assert_ne!(hash_pre_ins2, hash_post_ins2);
2152    }
2153
2154    const STR_ROW_SIZE: Size = row_size_for_type::<VarLenRef>();
2155
2156    const _: () = assert!(STR_ROW_SIZE.len() == mem::size_of::<VarLenRef>());
2157
2158    const STR_VL_VISITOR: AlignedVarLenOffsets<'_> = AlignedVarLenOffsets::from_offsets(&[0]);
2159    fn str_var_len_visitor() -> &'static AlignedVarLenOffsets<'static> {
2160        &STR_VL_VISITOR
2161    }
2162
2163    fn insert_str(page: &mut Page, data: &[u8]) -> PageOffset {
2164        let fixed_len_data = [0u8; STR_ROW_SIZE.len()];
2165        unsafe { page.insert_row(&fixed_len_data, &[data], str_var_len_visitor(), &mut NullBlobStore) }
2166            .expect("Failed to insert row")
2167    }
2168
2169    fn read_str_ref(page: &Page, offset: PageOffset) -> VarLenRef {
2170        *unsafe { get_ref(&page.row_data, offset) }
2171    }
2172
2173    #[test]
2174    fn insert_empty_str() {
2175        let mut page = Page::new(STR_ROW_SIZE);
2176
2177        // First the hash is not saved, so compute it.
2178        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2179
2180        // Insert the empty string.
2181        let offset = insert_str(&mut page, &[]);
2182
2183        // No granules were used.
2184        let extracted = read_str_ref(&page, offset);
2185        let mut granules_iter = unsafe { page.iter_var_len_object(extracted.first_granule) };
2186        assert!(granules_iter.next().is_none());
2187        drop(granules_iter);
2188
2189        // Hash is cleared even though the string was empty.
2190        assert_ne!(hash_pre_ins, hash_unmodified_save_get(&mut page));
2191    }
2192
2193    proptest! {
2194        #[test]
2195        fn insert_one_short_str(data in vec(any::<u8>(), 1..VarLenGranule::DATA_SIZE)) {
2196            let mut page = Page::new(STR_ROW_SIZE);
2197
2198            // First the hash is not saved, so compute it.
2199            let hash_pre_ins = hash_unmodified_save_get(&mut page);
2200
2201            // Insert the row.
2202            let offset = insert_str(&mut page, &data);
2203
2204            // Hash was cleared by the insert.
2205            let hash_pre_iter = hash_unmodified_save_get(&mut page);
2206            assert_ne!(hash_pre_ins, hash_pre_iter);
2207
2208            // Confirm we inserted correctly.
2209            let extracted = read_str_ref(&page, offset);
2210            let mut data_iter = unsafe { page.iter_vlo_data(extracted.first_granule) };
2211            let (first, next) = (data_iter.next(), data_iter.next());
2212            assert_eq!(first, Some(&*data));
2213            assert_eq!(next, None);
2214
2215            // Iteration and reading did not change the hash.
2216            assert_eq!(hash_pre_iter, page.header.unmodified_hash.unwrap());
2217        }
2218
2219        #[test]
2220        fn insert_one_long_str(data in vec(any::<u8>(), (VarLenGranule::OBJECT_SIZE_BLOB_THRESHOLD / 2)..VarLenGranule::OBJECT_SIZE_BLOB_THRESHOLD)) {
2221            let mut page = Page::new(STR_ROW_SIZE);
2222
2223            // First the hash is not saved, so compute it.
2224            let hash_pre_ins = hash_unmodified_save_get(&mut page);
2225
2226            // Insert the long string.
2227            let offset = insert_str(&mut page, &data);
2228
2229            // The hash was cleared, and the new one is different.
2230            let hash_post_ins = hash_unmodified_save_get(&mut page);
2231            assert_ne!(hash_pre_ins, hash_post_ins);
2232
2233            // Check that we inserted correctly.
2234            let extracted = read_str_ref(&page, offset);
2235
2236            let mut data_iter = unsafe { page.iter_vlo_data(extracted.first_granule) };
2237            let mut chunks_iter = data.chunks(VarLenGranule::DATA_SIZE);
2238
2239            for (i, (data, chunk)) in (&mut data_iter).zip(&mut chunks_iter).enumerate() {
2240                assert_eq!(
2241                    data,
2242                    chunk,
2243                    "Chunk {} does not match. Left is found, right is expected.",
2244                    i,
2245                );
2246            }
2247
2248            // Both iterators must be finished, i.e. they must have the same length.
2249            assert!(data_iter.next().is_none());
2250            assert!(chunks_iter.next().is_none());
2251
2252            // Reading did not alter the hash.
2253            assert_eq!(hash_post_ins, page.header.unmodified_hash.unwrap());
2254        }
2255    }
2256
2257    #[test]
2258    fn reuse_var_len_space_no_fragmentation_concerns() {
2259        let data_0 = b"Hello, world!";
2260        let data_1 = b"How goes life?";
2261        let data_2 = b"Glad to hear it.";
2262
2263        let mut page = Page::new(STR_ROW_SIZE);
2264        let offset_0 = insert_str(&mut page, data_0);
2265        let offset_1 = insert_str(&mut page, data_1);
2266
2267        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(2));
2268
2269        assert_ne!(offset_0.idx(), offset_1.idx());
2270
2271        let var_len_0 = read_str_ref(&page, offset_0);
2272
2273        assert_eq!(var_len_0.length_in_bytes as usize, data_0.len());
2274        assert_eq!(var_len_0.first_granule.idx(), data_sub_n_vlg(1));
2275
2276        let var_len_1 = read_str_ref(&page, offset_1);
2277
2278        assert_eq!(var_len_1.length_in_bytes as usize, data_1.len());
2279        assert_eq!(var_len_1.first_granule.idx(), data_sub_n_vlg(2));
2280
2281        let hash_pre_del = hash_unmodified_save_get(&mut page);
2282
2283        unsafe { page.delete_row(offset_0, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2284
2285        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2286
2287        let offset_2 = insert_str(&mut page, data_2);
2288
2289        let hash_post_ins = hash_unmodified_save_get(&mut page);
2290        assert_ne!(hash_pre_del, hash_pre_ins);
2291        assert_ne!(hash_pre_del, hash_post_ins);
2292        assert_ne!(hash_pre_ins, hash_post_ins);
2293
2294        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(2));
2295
2296        assert_eq!(offset_0.idx(), offset_2.idx());
2297
2298        let var_len_2 = read_str_ref(&page, offset_2);
2299
2300        assert_eq!(var_len_2.length_in_bytes as usize, data_2.len());
2301        assert_eq!(var_len_2.first_granule.idx(), var_len_0.first_granule.idx());
2302    }
2303
2304    #[test]
2305    fn free_var_len_obj_multiple_granules() {
2306        let mut page = Page::new(STR_ROW_SIZE);
2307
2308        // Allocate a 4-granule var-len object.
2309        let data_0 = [0xa5u8].repeat(VarLenGranule::DATA_SIZE * 4);
2310        let offset_0 = insert_str(&mut page, &data_0);
2311
2312        let var_len_0 = read_str_ref(&page, offset_0);
2313
2314        // Read the addresses of its var-len granules.
2315        let granules_0 = unsafe { page.iter_var_len_object(var_len_0.first_granule) }
2316            .map(|granule| granule as *const VarLenGranule as usize)
2317            .collect::<Vec<_>>();
2318
2319        // Sanity checks: we have allocated 4 granules.
2320        assert_eq!(granules_0.len(), 4);
2321        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(4));
2322
2323        // Delete the row.
2324        unsafe { page.delete_row(offset_0, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2325
2326        // Allocate a new 4-granule var-len object.
2327        // This should use the same storage as the original row.
2328        let data_1 = [0xffu8].repeat(VarLenGranule::DATA_SIZE * 4);
2329        let offset_1 = insert_str(&mut page, &data_1);
2330
2331        let var_len_1 = read_str_ref(&page, offset_1);
2332
2333        // Read the addresses of the new allocation's var-len granules.
2334        let granules_1 = unsafe { page.iter_var_len_object(var_len_1.first_granule) }
2335            .map(|granule| granule as *const VarLenGranule as usize)
2336            .collect::<Vec<_>>();
2337
2338        // Sanity check: the new allocation is also 4 granules.
2339        assert_eq!(granules_1.len(), 4);
2340
2341        for granule in granules_1.iter().copied() {
2342            // The new var-len allocation must contain all the same granules by address
2343            // as the old var-len allocation.
2344            assert!(granules_0.iter().copied().any(|other_granule| other_granule == granule));
2345        }
2346
2347        // The var-len high water mark must not have moved.
2348        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(4));
2349    }
2350
2351    #[test]
2352    fn reuse_var_len_space_avoid_fragmentation() {
2353        let data_0 = &[0xa5u8];
2354        let data_1 = &[0xffu8];
2355        let data_2 = [0x11u8].repeat(VarLenGranule::DATA_SIZE + 1);
2356        let data_2 = data_2.as_ref();
2357
2358        let mut page = Page::new(STR_ROW_SIZE);
2359
2360        // First the hash is not saved, so compute it.
2361        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2362
2363        // Insert two string rows.
2364        let offset_0 = insert_str(&mut page, data_0);
2365        let _offset_1 = insert_str(&mut page, data_1);
2366
2367        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(2));
2368
2369        // Hash is cleared by inserting and the new one is different.
2370        let hash_pre_del = hash_unmodified_save_get(&mut page);
2371        assert_ne!(hash_pre_ins, hash_pre_del);
2372
2373        // Delete the first row.
2374        unsafe { page.delete_row(offset_0, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2375
2376        // Hash is cleared by deleting.
2377        let hash_post_del = hash_unmodified_save_get(&mut page);
2378        assert_ne!(hash_pre_ins, hash_post_del);
2379        assert_ne!(hash_pre_del, hash_post_del);
2380
2381        // Insert again, re-using memory.
2382        let offset_2 = insert_str(&mut page, data_2);
2383
2384        assert_eq!(page.header.var.first.idx(), data_sub_n_vlg(3));
2385
2386        // Hash is cleared by inserting again, even though we re-used memory.
2387        let hash_post_ins2 = hash_unmodified_save_get(&mut page);
2388        assert_ne!(hash_pre_ins, hash_post_ins2);
2389        assert_ne!(hash_pre_del, hash_post_ins2);
2390        assert_ne!(hash_post_del, hash_post_ins2);
2391
2392        // Check that we inserted correctly.
2393        let var_len_2 = read_str_ref(&page, offset_2);
2394
2395        let mut data_iter = unsafe { page.iter_vlo_data(var_len_2.first_granule) };
2396        let mut chunks_iter = data_2.chunks(VarLenGranule::DATA_SIZE);
2397
2398        for (i, (data, chunk)) in (&mut data_iter).zip(&mut chunks_iter).enumerate() {
2399            assert_eq!(
2400                data, chunk,
2401                "Chunk {} does not match. Left is found, right is expected.",
2402                i,
2403            );
2404        }
2405
2406        // Both iterators must be finished, i.e. they must have the same length.
2407        assert!(data_iter.next().is_none());
2408        assert!(chunks_iter.next().is_none());
2409    }
2410
2411    fn check_u64_in_str(page: &Page, row_idx: PageOffset, expected_val: u64) {
2412        let vlr = read_str_ref(page, row_idx);
2413
2414        let mut var_len_iter = unsafe { page.iter_vlo_data(vlr.first_granule) };
2415        let data = var_len_iter.next().unwrap();
2416        assert!(var_len_iter.next().is_none());
2417        assert_eq!(data.len(), mem::size_of::<u64>());
2418
2419        let val = u64::from_le_bytes(data.try_into().unwrap());
2420        assert_eq!(val, expected_val);
2421    }
2422
2423    #[test]
2424    fn fill_then_iter_var_len_str() {
2425        let mut page = Page::new(STR_ROW_SIZE);
2426
2427        // First the hash is not saved, so compute it.
2428        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2429
2430        // Insert the strings.
2431        let last_val = insert_while(&mut page, 0, STR_ROW_SIZE, 1, |page, val| {
2432            insert_str(page, &val.to_le_bytes());
2433        });
2434
2435        // Hash is cleared by inserting and the new one is different.
2436        let hash_pre_iter = hash_unmodified_save_get(&mut page);
2437        assert_ne!(hash_pre_ins, hash_pre_iter);
2438
2439        // Check that we inserted correctly.
2440        let size_per_row = STR_ROW_SIZE + VarLenGranule::SIZE;
2441
2442        assert_eq!(last_val, (PageOffset::PAGE_END / size_per_row) as u64);
2443
2444        for (row_idx, expected_val) in page.iter_fixed_len(STR_ROW_SIZE).zip(0..last_val) {
2445            check_u64_in_str(&page, row_idx, expected_val);
2446        }
2447
2448        // Reading does not alter the hash.
2449        assert_eq!(hash_pre_iter, page.header.unmodified_hash.unwrap());
2450    }
2451
2452    #[test]
2453    fn fill_delete_iter_var_len_str() {
2454        let mut page = Page::new(STR_ROW_SIZE);
2455
2456        // First the hash is not saved, so compute it.
2457        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2458
2459        // Insert the string rows.
2460        let mut odds = Vec::new();
2461        let last_val = insert_while(&mut page, 0, STR_ROW_SIZE, 1, |page, val| {
2462            let offset = insert_str(page, &val.to_le_bytes());
2463            if val % 2 == 1 {
2464                odds.push(offset);
2465            }
2466        });
2467
2468        let size_per_row = STR_ROW_SIZE + VarLenGranule::SIZE;
2469        let num_rows_inserted = (PageOffset::PAGE_END / size_per_row) as u64;
2470        assert_eq!(last_val, num_rows_inserted);
2471
2472        // Hash was cleared by inserting and is different now.
2473        let hash_pre_del = hash_unmodified_save_get(&mut page);
2474        assert_ne!(hash_pre_ins, hash_pre_del);
2475
2476        // Delete the rows.
2477        for row_offset in odds {
2478            unsafe { page.delete_row(row_offset, STR_ROW_SIZE, str_var_len_visitor(), &mut NullBlobStore) };
2479        }
2480
2481        // Hash was cleared by deleting and is different now.
2482        let hash_pre_iter = hash_unmodified_save_get(&mut page);
2483        assert_ne!(hash_pre_ins, hash_pre_iter);
2484        assert_ne!(hash_pre_del, hash_pre_iter);
2485
2486        // Check that we deleted correctly.
2487        let num_rows_retained = num_rows_inserted.div_ceil(2);
2488        let num_rows_removed = num_rows_inserted / 2;
2489
2490        assert_eq!(page.header.fixed.num_rows as u64, num_rows_retained);
2491
2492        assert_eq!(page.header.var.freelist_len as u64, num_rows_removed);
2493
2494        for (row_idx, expected_val) in page.iter_fixed_len(STR_ROW_SIZE).zip((0..last_val).step_by(2)) {
2495            check_u64_in_str(&page, row_idx, expected_val);
2496        }
2497
2498        // Reading did not alter the hash.
2499        assert_eq!(hash_pre_iter, page.header.unmodified_hash.unwrap());
2500    }
2501
2502    #[test]
2503    fn serde_round_trip_whole_page() {
2504        let pool = PagePool::new_for_test();
2505        let mut page = Page::new(u64_row_size());
2506
2507        // Construct an empty page, ser/de it, and assert that it's still empty.
2508        let hash_pre_ins = hash_unmodified_save_get(&mut page);
2509        let ser_pre_ins = bsatn::to_vec(&page).unwrap();
2510        let de_pre_ins = pool.take_deserialize_from(&ser_pre_ins).unwrap();
2511        assert_eq!(de_pre_ins.content_hash(), hash_pre_ins);
2512        assert_eq!(de_pre_ins.header.fixed.num_rows, 0);
2513        assert!(de_pre_ins.header.fixed.present_rows == page.header.fixed.present_rows);
2514
2515        // Insert some rows into the page.
2516        let offsets = (0..64)
2517            .map(|val| insert_u64(&mut page, val))
2518            .collect::<Vec<PageOffset>>();
2519
2520        let hash_ins = hash_unmodified_save_get(&mut page);
2521
2522        // Ser/de the page and assert that it contains the same rows.
2523        let ser_ins = bsatn::to_vec(&page).unwrap();
2524        let de_ins = pool.take_deserialize_from(&ser_ins).unwrap();
2525        assert_eq!(de_ins.content_hash(), hash_ins);
2526        assert_eq!(de_ins.header.fixed.num_rows, 64);
2527        assert!(de_ins.header.fixed.present_rows == page.header.fixed.present_rows);
2528        assert_eq!(
2529            de_ins.iter_fixed_len(u64_row_size()).collect::<Vec<PageOffset>>(),
2530            offsets
2531        );
2532
2533        // Delete the even-numbered rows, leaving the odds.
2534        let offsets = offsets
2535            .into_iter()
2536            .enumerate()
2537            .filter_map(|(i, offset)| {
2538                if i % 2 == 0 {
2539                    unsafe { page.delete_row(offset, u64_row_size(), u64_var_len_visitor(), &mut NullBlobStore) };
2540                    None
2541                } else {
2542                    Some(offset)
2543                }
2544            })
2545            .collect::<Vec<PageOffset>>();
2546
2547        // Ser/de the page again and assert that it contains only the odd-numbered rows.
2548        let hash_del = hash_unmodified_save_get(&mut page);
2549        let ser_del = bsatn::to_vec(&page).unwrap();
2550        let de_del = pool.take_deserialize_from(&ser_del).unwrap();
2551        assert_eq!(de_del.content_hash(), hash_del);
2552        assert_eq!(de_del.header.fixed.num_rows, 32);
2553        assert!(de_del.header.fixed.present_rows == page.header.fixed.present_rows);
2554        assert_eq!(
2555            de_del.iter_fixed_len(u64_row_size()).collect::<Vec<PageOffset>>(),
2556            offsets
2557        );
2558    }
2559}