libblobd_lite/object.rs

use crate::allocator::Allocator;
use crate::page::Pages;
use crate::page::PAGE_HEADER_CAP;
#[cfg(test)]
use crate::test_util::device::TestSeekableAsyncFile as SeekableAsyncFile;
#[cfg(test)]
use crate::test_util::journal::TestTransaction as Transaction;
use crate::util::ceil_pow2;
use crate::util::div_mod_pow2;
use off64::int::Off64ReadInt;
use off64::u8;
#[cfg(not(test))]
use seekable_async_file::SeekableAsyncFile;
#[cfg(not(test))]
use write_journal::Transaction;

/**

Structure
---------

We cannot store tail data inline: we need to know the inode size in order to allocate a fragment for it, but we cannot know the length of any inline tail data until we know the inode size. For simplicity and flexibility, we just use a fragment for tail data instead of storing it inline, which would make it subject to the free space within the same fragmented tile.

The ordering of these fields is important (and may look arbitrary): it is chosen so that common operations avoid multiple small reads.
- **write_object** requires `size`, `obj_id`, `key_len`, `tail_data_fragment_dev_offset_or_zero_if_none`, and one `tile` element.
- **find_inode_in_bucket** requires `obj_id`, `key_len`, `key`, and `next_inode_dev_offset_or_zero_if_end`.
- **read_object** requires *find_inode_in_bucket* as well as `size`, `tail_data_fragment_dev_offset_or_zero_if_none`, and one `tile` element.
- **commit_object** requires `obj_id`.

u8[16] _reserved_by_header
u48 created_ms
u40 size
u64 obj_id
u16 key_len
u8[] key
u48[] lpage_page_dev_offset
u48[] tail_page_dev_offset
// Put whatever you want here, it'll have the same lifecycle as the object. This could be raw bytes, serialised map, packed array of structs, slotted data, whatever.
u16 assoc_data_len
u8[] assoc_data

**/

#[derive(Clone, Copy)]
pub(crate) struct ObjectOffsets {
  key_len: u16,
  lpage_count: u64,
  tail_page_count: u8,
  assoc_data_len: u16,
}

impl ObjectOffsets {
  pub fn with_key_len(self, key_len: u16) -> Self {
    Self { key_len, ..self }
  }

  pub fn with_lpages(self, lpage_count: u64) -> Self {
    Self {
      lpage_count,
      ..self
    }
  }

  pub fn with_tail_pages(self, tail_page_count: u8) -> Self {
    Self {
      tail_page_count,
      ..self
    }
  }

  pub fn with_assoc_data_len(self, assoc_data_len: u16) -> Self {
    Self {
      assoc_data_len,
      ..self
    }
  }

  pub fn _reserved_by_header(self) -> u64 {
    0
  }

  pub fn created_ms(self) -> u64 {
    self._reserved_by_header() + PAGE_HEADER_CAP
  }

  pub fn size(self) -> u64 {
    self.created_ms() + 6
  }

  pub fn id(self) -> u64 {
    self.size() + 5
  }

  pub fn key_len(self) -> u64 {
    self.id() + 8
  }

  pub fn key(self) -> u64 {
    self.key_len() + 2
  }

  pub fn lpages(self) -> u64 {
    self.key() + u64::from(self.key_len)
  }

  pub fn lpage(self, idx: u64) -> u64 {
    self.lpages() + 6 * idx
  }

  pub fn tail_pages(self) -> u64 {
    self.lpage(self.lpage_count)
  }

  pub fn tail_page(self, idx: u8) -> u64 {
    self.tail_pages() + 6 * u64::from(idx)
  }

  pub fn assoc_data_len(self) -> u64 {
    self.tail_page(self.tail_page_count)
  }

  pub fn assoc_data(self) -> u64 {
    self.assoc_data_len() + 2
  }

  pub fn _total_size(self) -> u64 {
    self.assoc_data() + u64::from(self.assoc_data_len)
  }
}

/// WARNING: This is only safe to use for getting the offsets of fields up to and including `key`. Call the `with_*` methods to get offsets of other fields.
pub(crate) const OBJECT_OFF: ObjectOffsets = ObjectOffsets {
  assoc_data_len: 0,
  key_len: 0,
  lpage_count: 0,
  tail_page_count: 0,
};
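
// Illustrative sanity check, not part of the original source: a sketch of a test that
// verifies the builder-computed offsets match the field widths documented in the layout
// comment above (u48 created_ms, u40 size, u64 obj_id, u16 key_len, variable-length key,
// u48 page offsets, u16 assoc_data_len). The field lengths used here are arbitrary.
#[cfg(test)]
#[test]
fn object_offsets_follow_documented_layout() {
  let off = OBJECT_OFF
    .with_key_len(3)
    .with_lpages(2)
    .with_tail_pages(1)
    .with_assoc_data_len(4);
  assert_eq!(off.created_ms(), PAGE_HEADER_CAP);
  assert_eq!(off.size(), off.created_ms() + 6); // u48 created_ms.
  assert_eq!(off.id(), off.size() + 5); // u40 size.
  assert_eq!(off.key_len(), off.id() + 8); // u64 obj_id.
  assert_eq!(off.key(), off.key_len() + 2); // u16 key_len.
  assert_eq!(off.lpages(), off.key() + 3); // 3 key bytes.
  assert_eq!(off.tail_pages(), off.lpages() + 6 * 2); // Two u48 lpage offsets.
  assert_eq!(off.assoc_data_len(), off.tail_pages() + 6); // One u48 tail page offset.
  assert_eq!(off._total_size(), off.assoc_data_len() + 2 + 4); // u16 len + 4 assoc bytes.
}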

// This makes it so that a read of the object fields up to and including the key is at most 512 bytes, which is a well-aligned, well-sized, no-waste read from most SSDs. In case you're worried that it's not long enough, this is what 497 bytes looks like:
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
// aaaaaaaaaaaaaaaaa
pub const OBJECT_KEY_LEN_MAX: u16 = 497;

// Two benefits of this over Vec<u8>:
// - Length type is u8, which is the type for `tail_page_count`.
// - No heap allocations; small enough to copy.
// How it works:
// - Page sizes can only be from `2^8` to `2^32` (see `{MIN,MAX}_PAGE_SIZE_POW2`).
// - There are only `32 - 8 = 24` page sizes.
// - It takes 5 bits to represent 24 distinct values.
// - There can be up to 24 entries, each taking 5 bits. It will take 5 bits to represent the length.
// - We can't fit 125 bits neatly, so we split up into two shards (64-bit ints), and use 4 bits in each int to represent the length of up to 12 entries covered by its int, using exactly `4 + 12 * 5 = 64` bits.
#[derive(Clone, Copy)]
pub struct TailPageSizes(u64, u64);

impl TailPageSizes {
  pub fn new() -> Self {
    Self(0, 0)
  }

  fn shard_get_len(shard: u64) -> u8 {
    u8!((shard & (0b1111 << 60)) >> 60)
  }

  fn shard_set_len(shard: &mut u64, len: u8) {
    assert!(len <= 12);
    *shard &= !(0b1111 << 60);
    *shard |= u64::from(len) << 60;
  }

  fn shard_get(shard: u64, idx: u8) -> u8 {
    assert!(idx < 12);
    let shift = idx * 5;
    u8!((shard & (0b11111 << shift)) >> shift)
  }

  fn shard_set(shard: &mut u64, idx: u8, val: u8) {
    assert!(val < (1 << 5));
    let shift = idx * 5;
    *shard &= !(0b11111 << shift);
    *shard |= u64::from(val) << shift;
  }

  pub fn len(&self) -> u8 {
    Self::shard_get_len(self.0) + Self::shard_get_len(self.1)
  }

  pub fn get(&self, idx: u8) -> Option<u8> {
    if idx >= self.len() {
      return None;
    };
    Some(if idx < 12 {
      Self::shard_get(self.0, idx)
    } else {
      Self::shard_get(self.1, idx - 12)
    })
  }

  pub fn push(&mut self, pow2: u8) {
    let idx = self.len();
    if idx < 12 {
      Self::shard_set(&mut self.0, idx, pow2);
      Self::shard_set_len(&mut self.0, idx + 1);
    } else {
      Self::shard_set(&mut self.1, idx - 12, pow2);
      Self::shard_set_len(&mut self.1, idx - 12 + 1);
    };
  }
}

impl IntoIterator for TailPageSizes {
  type IntoIter = TailPageSizesIter;
  type Item = (u8, u8);

  fn into_iter(self) -> Self::IntoIter {
    TailPageSizesIter(self, 0)
  }
}

// Convenient iterator that provides `u8` indices instead of `.enumerate()` and `u8!(idx)`.
pub struct TailPageSizesIter(TailPageSizes, u8);

impl Iterator for TailPageSizesIter {
  type Item = (u8, u8);

  fn next(&mut self) -> Option<Self::Item> {
    let idx = self.1;
    let entry = self.0.get(idx)?;
    self.1 += 1;
    Some((idx, entry))
  }
}
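
// Illustrative sketch, not part of the original source: a test exercising the packed
// representation described above. The first 12 entries land in shard 0 and the rest in
// shard 1; `len`, `get`, and iteration read them back in insertion order.
#[cfg(test)]
#[test]
fn tail_page_sizes_round_trip() {
  let mut sizes = TailPageSizes::new();
  // Push 14 entries so both shards are exercised; the values are arbitrary pow2 exponents.
  for pow2 in 8..22u8 {
    sizes.push(pow2);
  }
  assert_eq!(sizes.len(), 14);
  for (idx, pow2) in sizes {
    assert_eq!(pow2, 8 + idx);
  }
  assert_eq!(sizes.get(13), Some(21));
  assert_eq!(sizes.get(14), None);
}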

pub(crate) struct ObjectLayout {
  pub lpage_count: u64,
  pub tail_page_sizes_pow2: TailPageSizes,
}

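// Worked example (the page sizes here are assumed for illustration, not taken from this
// crate's configuration): with lpage_size_pow2 = 24 (16 MiB) and spage_size_pow2 = 12
// (4 KiB), an object of 33,564,432 bytes decomposes as:
// - lpage_count = 33,564,432 >> 24 = 2, leaving tail_size = 10,000 bytes.
// - rem = ceil_pow2(10,000, 12) = 12,288 = 2^13 + 2^12.
// - Peeling off the highest set bit each iteration yields tail_page_sizes_pow2 = [13, 12],
//   i.e. one 8 KiB tail page followed by one 4 KiB tail page.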
pub(crate) fn calc_object_layout(pages: &Pages, object_size: u64) -> ObjectLayout {
  let (lpage_count, tail_size) = div_mod_pow2(object_size, pages.lpage_size_pow2);
  let mut rem = ceil_pow2(tail_size, pages.spage_size_pow2);
  let mut tail_page_sizes_pow2 = TailPageSizes::new();
  loop {
    let pos = rem.leading_zeros();
    if pos == 64 {
      break;
    };
    let pow2 = u8!(63 - pos);
    tail_page_sizes_pow2.push(pow2);
    rem &= !(1 << pow2);
  }
  ObjectLayout {
    lpage_count,
    tail_page_sizes_pow2,
  }
}

#[must_use]
pub(crate) struct ReleasedObject {
  pub object_metadata_size: u64,
  pub object_data_size: u64,
}

/// WARNING: This does not verify the page type, nor detach the inode from whatever list it's on, but it will clear the page header via `alloc.release`.
pub(crate) async fn release_object(
  txn: &mut Transaction,
  dev: &SeekableAsyncFile,
  pages: &Pages,
  alloc: &mut Allocator,
  page_dev_offset: u64,
  page_size_pow2: u8,
) -> ReleasedObject {
  // Read the entire metadata page so all variable-length fields are available.
  let raw = dev.read_at(page_dev_offset, 1 << page_size_pow2).await;
  let object_size = raw.read_u40_be_at(OBJECT_OFF.size());
  let key_len = raw.read_u16_be_at(OBJECT_OFF.key_len());
  let ObjectLayout {
    lpage_count,
    tail_page_sizes_pow2,
  } = calc_object_layout(pages, object_size);

  let off = OBJECT_OFF
    .with_key_len(key_len)
    .with_lpages(lpage_count)
    .with_tail_pages(tail_page_sizes_pow2.len());
  // Release the object's full-size data pages (lpages).
  for i in 0..lpage_count {
    let page_dev_offset = raw.read_u48_be_at(off.lpage(i));
    alloc
      .release(txn, page_dev_offset, pages.lpage_size_pow2)
      .await;
  }
  // Release the variable-sized tail data pages.
  for (i, tail_page_size_pow2) in tail_page_sizes_pow2 {
    let page_dev_offset = raw.read_u48_be_at(off.tail_page(i));
    alloc
      .release(txn, page_dev_offset, tail_page_size_pow2)
      .await;
  }
  // Finally, release the metadata (inode) page itself.
  alloc.release(txn, page_dev_offset, page_size_pow2).await;
  ReleasedObject {
    object_data_size: object_size,
    object_metadata_size: off._total_size(),
  }
}