libblobd_lite/page/mod.rs

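//! Page management for the heap: fixed-capacity page headers stored at the start of every page,
//! and per-block free-page bitmaps kept in each block's metadata lpage (see `Pages::new` and
//! `get_page_free_bit_offset` for layout details).
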
#[cfg(test)]
pub mod tests;

#[cfg(test)]
use crate::test_util::journal::TestTransaction as Transaction;
#[cfg(test)]
use crate::test_util::journal::TestWriteJournal as WriteJournal;
use crate::util::div_mod_pow2;
use crate::util::div_pow2;
use crate::util::floor_pow2;
use crate::util::mod_pow2;
use num_derive::FromPrimitive;
use num_traits::FromPrimitive;
use off64::int::create_u64_be;
use off64::int::Off64ReadInt;
use off64::int::Off64WriteMutInt;
use off64::usz;
use std::sync::Arc;
use tracing::info;
#[cfg(not(test))]
use write_journal::Transaction;
#[cfg(not(test))]
use write_journal::WriteJournal;

pub(crate) const MIN_PAGE_SIZE_POW2: u8 = 8;
pub(crate) const MAX_PAGE_SIZE_POW2: u8 = 32;

// Bytes to reserve at the start of each page for its header.
const PAGE_HEADER_CAP_POW2: u8 = 4;
pub(crate) const PAGE_HEADER_CAP: u64 = 1 << PAGE_HEADER_CAP_POW2;

const FREE_PAGE_OFFSETOF_PREV: u64 = 0;
const FREE_PAGE_OFFSETOF_NEXT: u64 = FREE_PAGE_OFFSETOF_PREV + 5;

const OBJECT_OFFSETOF_PREV: u64 = 0;
const OBJECT_OFFSETOF_NEXT: u64 = OBJECT_OFFSETOF_PREV + 5;
const OBJECT_OFFSETOF_DELETED_SEC: u64 = OBJECT_OFFSETOF_NEXT + 5;
const OBJECT_OFFSETOF_META_SIZE_AND_STATE: u64 = OBJECT_OFFSETOF_DELETED_SEC + 5;
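// The single byte at OBJECT_OFFSETOF_META_SIZE_AND_STATE packs two fields (see
// `ObjectPageHeader::{serialize, deserialize}`):
//   bits 7..=3: metadata_size_pow2
//   bits 2..=0: ObjectState (1 = Incomplete, 2 = Committed, 3 = Deleted)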

pub(crate) trait PageHeader {
  /// Many device offsets can be stored using one fewer byte by right shifting by MIN_PAGE_SIZE_POW2, since they can never refer to anything more granular than a page. Remember to left shift when deserialising.
  fn serialize(&self, out: &mut [u8]);
  fn deserialize(raw: &[u8]) -> Self;
}
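
// Illustrative only: with MIN_PAGE_SIZE_POW2 = 8, a 5-byte (u40) field can address page-aligned
// device offsets up to 2^40 * 2^8 = 2^48 bytes (256 TiB):
//   let stored = dev_offset >> MIN_PAGE_SIZE_POW2; // fits in 40 bits
//   let dev_offset = stored << MIN_PAGE_SIZE_POW2; // restored when deserialising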

pub(crate) struct FreePagePageHeader {
  pub prev: u64,
  pub next: u64,
}

impl PageHeader for FreePagePageHeader {
  fn serialize(&self, out: &mut [u8]) {
    out.write_u40_be_at(FREE_PAGE_OFFSETOF_PREV, self.prev >> MIN_PAGE_SIZE_POW2);
    out.write_u40_be_at(FREE_PAGE_OFFSETOF_NEXT, self.next >> MIN_PAGE_SIZE_POW2);
  }

  fn deserialize(raw: &[u8]) -> Self {
    Self {
      prev: raw.read_u40_be_at(FREE_PAGE_OFFSETOF_PREV) << MIN_PAGE_SIZE_POW2,
      next: raw.read_u40_be_at(FREE_PAGE_OFFSETOF_NEXT) << MIN_PAGE_SIZE_POW2,
    }
  }
}

#[derive(PartialEq, Eq, Clone, Copy, Debug, FromPrimitive)]
#[repr(u8)]
pub(crate) enum ObjectState {
  // Avoid 0 to detect uninitialised/missing/corrupt state.
  Incomplete = 1,
  Committed,
  Deleted,
}

pub(crate) struct ObjectPageHeader {
  // Device offset of prev/next inode in incomplete/deleted/bucket linked list, or zero if tail.
  pub prev: u64,
  pub next: u64,
  // Timestamp in seconds since epoch.
  pub deleted_sec: Option<u64>,
  pub state: ObjectState,
  pub metadata_size_pow2: u8,
}

impl PageHeader for ObjectPageHeader {
  fn serialize(&self, out: &mut [u8]) {
    out.write_u40_be_at(OBJECT_OFFSETOF_PREV, self.prev >> MIN_PAGE_SIZE_POW2);
    out.write_u40_be_at(OBJECT_OFFSETOF_NEXT, self.next >> MIN_PAGE_SIZE_POW2);
    out.write_u40_be_at(OBJECT_OFFSETOF_DELETED_SEC, self.deleted_sec.unwrap_or(0));
    out[usz!(OBJECT_OFFSETOF_META_SIZE_AND_STATE)] =
      (self.metadata_size_pow2 << 3) | (self.state as u8);
  }

  fn deserialize(raw: &[u8]) -> Self {
    let b = raw[usz!(OBJECT_OFFSETOF_META_SIZE_AND_STATE)];
    Self {
      prev: raw.read_u40_be_at(OBJECT_OFFSETOF_PREV) << MIN_PAGE_SIZE_POW2,
      next: raw.read_u40_be_at(OBJECT_OFFSETOF_NEXT) << MIN_PAGE_SIZE_POW2,
      deleted_sec: Some(raw.read_u40_be_at(OBJECT_OFFSETOF_DELETED_SEC)).filter(|ts| *ts != 0),
      state: ObjectState::from_u8(b & 0b111).unwrap(),
      metadata_size_pow2: b >> 3,
    }
  }
}
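
// A minimal round-trip sketch (hypothetical values, for illustration only):
//   let hdr = ObjectPageHeader {
//     prev: 0,
//     next: 0,
//     deleted_sec: None,
//     state: ObjectState::Incomplete,
//     metadata_size_pow2: 12,
//   };
//   let mut buf = vec![0u8; usz!(PAGE_HEADER_CAP)];
//   hdr.serialize(&mut buf);
//   assert_eq!(ObjectPageHeader::deserialize(&buf).state, ObjectState::Incomplete);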

pub(crate) struct Pages {
  // To access the overlay.
  journal: Arc<WriteJournal>,
  heap_dev_offset: u64,
  block_size_pow2: u8,
  pages_per_lpage_pow2: u8,
  /// WARNING: Do not modify after creation.
  pub block_size: u64,
  /// WARNING: Do not modify after creation.
  pub lpage_size_pow2: u8,
  /// WARNING: Do not modify after creation.
  pub spage_size_pow2: u8,
}

impl Pages {
  pub fn new(
    journal: Arc<WriteJournal>,
    heap_dev_offset: u64,
    spage_size_pow2: u8,
    lpage_size_pow2: u8,
  ) -> Pages {
    assert!(spage_size_pow2 >= MIN_PAGE_SIZE_POW2);
    assert!(lpage_size_pow2 >= spage_size_pow2 && lpage_size_pow2 <= MAX_PAGE_SIZE_POW2);
    // For fast bitwise calculations to be correct, the heap needs to be aligned to `2^lpage_size_pow2` i.e. start at an address that is a multiple of the largest page size in bytes.
    assert_eq!(mod_pow2(heap_dev_offset, lpage_size_pow2), 0);
    // `lpage` means a page of the largest size. `spage` means a page of the smallest size. A data lpage contains actual data, while a metadata lpage contains the free page bitmap for all pages in the following N data lpages (see the following code for the value of N). Both are lpages (i.e. pages of the largest page size). A data lpage can hold X pages, where X counts every page of every size and offset that could possibly exist within it.
    // A metadata lpage and the following data lpages it covers constitute a block.
    // The page count per lpage is equal to `2 * (2^lpage_size_pow2 / 2^spage_size_pow2) - 1`, which is identical to `2^(1 + lpage_size_pow2 - spage_size_pow2) - 1`, but we add one to keep it a power of two. It's equivalent to the count of nodes in a full binary tree, where leaf nodes are the spages and the root node is the sole lpage.
    let pages_per_lpage_pow2 = 1 + lpage_size_pow2 - spage_size_pow2;
    // Calculate how many data lpages we can track in one metadata lpage; we need one bit to track one page. This is equal to `2^lpage_size_pow2 * 8 / 2^pages_per_lpage_pow2`, which is identical to `2^(lpage_size_pow2 + 3 - pages_per_lpage_pow2)`.
    let lpages_per_block_pow2 = lpage_size_pow2 + 3 - pages_per_lpage_pow2;
    // To keep calculations fast, we only track the next N data lpages, where N is a power of two minus one. This way, any device offset in those data lpages can find the device offset of the start of its corresponding metadata lpage with a simple bitwise AND, and the size of a block is simply `2^lpages_per_block_pow2 * 2^lpage_size_pow2`, since one data lpage is effectively replaced with the metadata lpage. However, this does mean that the metadata lpage wastes some space.
    let block_size_pow2 = lpages_per_block_pow2 + lpage_size_pow2;
    let block_size = 1 << block_size_pow2;
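    // Worked example (illustrative values only): with spage_size_pow2 = 8 and lpage_size_pow2 = 16,
    // pages_per_lpage_pow2 = 1 + 16 - 8 = 9 (511 possible pages per lpage, rounded up to 512 bits),
    // lpages_per_block_pow2 = 16 + 3 - 9 = 10, so one metadata lpage covers 1023 data lpages and
    // block_size_pow2 = 10 + 16 = 26, i.e. a 64 MiB block.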
    info!(block_size, "page config");
    Pages {
      block_size_pow2,
      block_size,
      heap_dev_offset,
      journal,
      lpage_size_pow2,
      pages_per_lpage_pow2,
      spage_size_pow2,
    }
  }

  #[allow(unused)]
  pub fn spage_size(&self) -> u64 {
    1 << self.spage_size_pow2
  }

  #[allow(unused)]
  pub fn lpage_size(&self) -> u64 {
    1 << self.lpage_size_pow2
  }

  fn assert_valid_page_dev_offset(&self, page_dev_offset: u64) {
    // Must be in the heap.
    assert!(
      page_dev_offset >= self.heap_dev_offset,
      "page dev offset {page_dev_offset} is not in the heap"
    );
    // Must not be in a metadata lpage. Note that the heap is only aligned to lpage, not an entire block, so we must first subtract the heap dev offset.
    assert!(
      mod_pow2(page_dev_offset - self.heap_dev_offset, self.block_size_pow2) >= self.lpage_size(),
      "page dev offset {page_dev_offset} is in a metadata lpage"
    );
    // Must be a multiple of the spage size.
    assert_eq!(
      mod_pow2(page_dev_offset, self.spage_size_pow2),
      0,
      "page dev offset {page_dev_offset} is not a multiple of spage"
    );
  }

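  /// Maps a page to its bit in the block's free-page bitmap, returning `(elem_dev_offset, bit)`:
  /// the device offset of the 8-byte bitmap word that tracks the page, and the bit position within
  /// that word. A set bit means the page is free (see `is_page_free` and `mark_page_as_free`).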
  fn get_page_free_bit_offset(&self, page_dev_offset: u64, page_size_pow2: u8) -> (u64, u64) {
    self.assert_valid_page_dev_offset(page_dev_offset);
    // Heap is only aligned to lpage, not an entire block, so we must add/subtract the heap dev offset.
    let block_dev_offset = self.heap_dev_offset
      + floor_pow2(page_dev_offset - self.heap_dev_offset, self.block_size_pow2);
    // We store our bitmap like a binary tree, where the root node is the lpage, and leaf nodes are spages.
    // Let's assume each bit is like an array element with a u64 index, and the array starts at index 1. Then, given `spage_pow2 == 2` and `lpage_pow2 == 16`:
    // Index 1  (0b0001) = page16_0 => page_pow2 == lpage_pow2 - (63 - lead_zeros) = 16 - (63 - 63) = 16
    // Index 2  (0b0010) = page15_0 => page_pow2 == lpage_pow2 - (63 - lead_zeros) = 16 - (63 - 62) = 15
    // Index 3  (0b0011) = page15_1 => page_pow2 == lpage_pow2 - (63 - lead_zeros) = 16 - (63 - 62) = 15
    // Index 4  (0b0100) = page14_0 => page_pow2 == lpage_pow2 - (63 - lead_zeros) = 16 - (63 - 61) = 14
    // Index 5  (0b0101) = page14_1 => ...
    // Index 6  (0b0110) = page14_2
    // Index 7  (0b0111) = page14_3
    // Index 8  (0b1000) = page13_0
    // Index 9  (0b1001) = page13_1
    // Index 10 (0b1010) = page13_2
    // ...
    // Based on the above pattern, for a page of size 2^S at zero-based position N within its lpage,
    // the 1-based index I is `(1 << (lpage_pow2 - S)) | N`, where `N = (page_dev_offset - lpage_dev_offset) / 2^S`.
    // Now:
    // - We have multiple lpages per block, not just one.
    // - We split our bitmaps into 64-bit integers to reduce load on the overlay buckets.
    // - Indices are 0-based, not 1-based.
    // So:
    // - Get the lpage number within the block.
    // - Get the page number within the lpage (`N`).
    // - Get the u64 value at element `(I - 1) / 64`, which is at dev offset `metadata_lpage_dev_offset + ((I - 1) / 64) * 8`.
    // - Get the bit at position `(I - 1) % 64`.
    let (lpage_n, offset_within_lpage) =
      div_mod_pow2(page_dev_offset - block_dev_offset, self.lpage_size_pow2);
    let lpage_dev_offset = page_dev_offset - offset_within_lpage;
    let n = div_pow2(page_dev_offset - lpage_dev_offset, page_size_pow2);
    let i = (lpage_n * (1 << self.pages_per_lpage_pow2))
      | (1 << (self.lpage_size_pow2 - page_size_pow2))
      | n;
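    // Continuing the example above (spage_pow2 == 2, lpage_pow2 == 16, so pages_per_lpage_pow2 == 15):
    // a page of size 2^14 at byte offset 0xC000 within the first data lpage (lpage_n == 1, since lpage 0
    // is the metadata lpage) has N = 0xC000 / 2^14 = 3 and I = (1 << 15) | (1 << (16 - 14)) | 3 = 32775,
    // which lands in element (I - 1) / 64 = 512 (dev offset block_dev_offset + 512 * 8) at bit (I - 1) % 64 = 6.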
    let (elem, bit) = div_mod_pow2(i - 1, 6);
    let elem_dev_offset = block_dev_offset + (elem * 8);

    (elem_dev_offset, bit)
  }

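  /// Returns whether the page of size `2^page_size_pow2` at `page_dev_offset` is currently marked
  /// as free in its block's bitmap, reading through the journal's overlay.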
  pub async fn is_page_free(&self, page_dev_offset: u64, page_size_pow2: u8) -> bool {
    let (elem_dev_offset, bit) = self.get_page_free_bit_offset(page_dev_offset, page_size_pow2);
    let bitmap = self
      .journal
      .read_with_overlay(elem_dev_offset, 8)
      .await
      .read_u64_be_at(0);
    (bitmap & (1 << bit)) != 0
  }

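  /// Marks the page as free by setting its bit in the block's free-page bitmap; the updated
  /// 8-byte bitmap word is written through `txn`'s overlay.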
  pub async fn mark_page_as_free(
    &self,
    txn: &mut Transaction,
    page_dev_offset: u64,
    page_size_pow2: u8,
  ) {
    let (elem_dev_offset, bit) = self.get_page_free_bit_offset(page_dev_offset, page_size_pow2);
    let bitmap = self
      .journal
      .read_with_overlay(elem_dev_offset, 8)
      .await
      .read_u64_be_at(0);
    txn.write_with_overlay(elem_dev_offset, create_u64_be(bitmap | (1 << bit)));
  }

  pub async fn mark_page_as_not_free(
    &self,
    txn: &mut Transaction,
    page_dev_offset: u64,
    page_size_pow2: u8,
  ) {
    let (elem_dev_offset, bit) = self.get_page_free_bit_offset(page_dev_offset, page_size_pow2);
    let bitmap = self
      .journal
      .read_with_overlay(elem_dev_offset, 8)
      .await
      .read_u64_be_at(0);
    txn.write_with_overlay(elem_dev_offset, create_u64_be(bitmap & !(1 << bit)));
  }

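  /// Reads the first `PAGE_HEADER_CAP` bytes of the page through the journal's overlay and
  /// deserialises them as a header of type `H`.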
  pub async fn read_page_header<H: PageHeader>(&self, page_dev_offset: u64) -> H {
    self.assert_valid_page_dev_offset(page_dev_offset);
    let raw = self
      .journal
      .read_with_overlay(page_dev_offset, PAGE_HEADER_CAP)
      .await;
    H::deserialize(&raw)
  }

  pub fn write_page_header<H: PageHeader>(
    &self,
    txn: &mut Transaction,
    page_dev_offset: u64,
    h: H,
  ) {
    self.assert_valid_page_dev_offset(page_dev_offset);
    let mut out = vec![0u8; usz!(PAGE_HEADER_CAP)];
    h.serialize(&mut out);
    txn.write_with_overlay(page_dev_offset, out);
  }

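  /// Convenience helper: reads the page header as type `H`, lets `f` mutate it, then writes the
  /// modified header back through `txn`.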
  pub async fn update_page_header<H: PageHeader>(
    &self,
    txn: &mut Transaction,
    page_dev_offset: u64,
    f: impl FnOnce(&mut H),
  ) {
    let mut hdr = self.read_page_header(page_dev_offset).await;
    f(&mut hdr);
    self.write_page_header(txn, page_dev_offset, hdr);
  }
}
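
// Illustrative call-site sketch (not code from this crate; `pages`, `txn`, `obj_dev_offset` and
// `now_sec` are hypothetical):
//   pages
//     .update_page_header::<ObjectPageHeader>(&mut txn, obj_dev_offset, |h| {
//       h.state = ObjectState::Deleted;
//       h.deleted_sec = Some(now_sec);
//     })
//     .await;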