//! Buffer pool for caching pages of [Blob] data in memory.
//!
//! Part of `commonware-runtime` (0.0.62): execute asynchronous tasks with a configurable
//! scheduler.
use crate::{Blob, Error, RwLock};
use commonware_utils::StableBuf;
use futures::{future::Shared, FutureExt};
use std::{
    collections::{hash_map::Entry, HashMap},
    future::Future,
    num::NonZeroUsize,
    pin::Pin,
    sync::{
        atomic::{AtomicBool, AtomicU64, Ordering},
        Arc,
    },
};
use tracing::{debug, trace};

// Type alias for the future stored for each in-flight page fetch, keyed by (blob id, page num)
// in `Pool::page_fetches`.
//
// We wrap [Error] in an Arc so the output is cloneable, which is required for the future to be
// [Shared]: every task awaiting the same fetch receives a clone of the same result.
type PageFetchFut = Shared<Pin<Box<dyn Future<Output = Result<StableBuf, Arc<Error>>> + Send>>>;

/// A [Pool] caches pages of [Blob] data in memory.
///
/// A single buffer pool can be used to cache data from multiple blobs by assigning a unique id to
/// each.
///
/// Implements the [Clock](https://en.wikipedia.org/wiki/Page_replacement_algorithm#Clock)
/// replacement policy, which is a lightweight approximation of LRU. The page `cache` is a circular
/// list of recently accessed pages, and `clock` is the index of the next page within it to examine
/// for replacement. When a page needs to be evicted, we start the search at `clock` within `cache`,
/// searching for the first page with a false reference bit, and setting any skipped page's
/// reference bit to false along the way.
pub struct Pool {
    /// The page cache index, with a key composed of (blob id, page number), that maps each cached
    /// page to the index of its `cache` entry.
    ///
    /// # Invariants
    ///
    /// Each `index` entry maps to exactly one `cache` entry, and that cache entry always has a
    /// matching key.
    index: HashMap<(u64, u64), usize>,

    /// The page cache.
    ///
    /// Each `cache` entry has exactly one corresponding `index` entry. Never holds more than
    /// `capacity` entries.
    cache: Vec<CacheEntry>,

    /// The Clock replacement policy's clock hand index into `cache`.
    clock: usize,

    /// The maximum number of pages that will be cached.
    capacity: usize,

    /// A map of currently executing page fetches to ensure only one task at a time is trying to
    /// fetch a specific page. Entries are inserted by the first fetcher of a page and removed by
    /// that same fetcher once the fetch completes.
    page_fetches: HashMap<(u64, u64), PageFetchFut>,
}

/// A single cached page along with the bookkeeping needed by the Clock replacement policy.
struct CacheEntry {
    /// The cache key which is composed of the blob id and page number of the page.
    key: (u64, u64),

    /// A bit indicating whether this page was recently referenced.
    ///
    /// Atomic so that `Pool::read_at`, which takes only `&self` (callers hold just the pool's
    /// read lock), can still mark the page as recently used.
    referenced: AtomicBool,

    /// The cached page itself.
    data: Vec<u8>,
}

/// A reference to a [Pool] that can be shared across threads via cloning, along with the page size
/// that will be used with it. Provides the API for interacting with the buffer pool in a
/// thread-safe manner.
#[derive(Clone)]
pub struct PoolRef {
    /// The size of each page in the buffer pool.
    pub(super) page_size: usize,

    /// The next id to assign to a blob that will be managed by this pool.
    ///
    /// Shared (and atomic) so every clone of this [PoolRef] draws from the same id sequence.
    next_id: Arc<AtomicU64>,

    /// Shareable reference to the buffer pool.
    pool: Arc<RwLock<Pool>>,
}

impl PoolRef {
    /// Returns a new [PoolRef] with the given `page_size` and `capacity`.
    pub fn new(page_size: NonZeroUsize, capacity: NonZeroUsize) -> Self {
        Self {
            page_size: page_size.get(),
            next_id: Arc::new(AtomicU64::new(0)),
            pool: Arc::new(RwLock::new(Pool::new(capacity.get()))),
        }
    }

    /// Returns a unique id for the next blob that will use this buffer pool.
    pub async fn next_id(&self) -> u64 {
        // Relaxed is sufficient: each caller only needs to observe a distinct value, and
        // fetch_add guarantees that regardless of memory ordering.
        self.next_id.fetch_add(1, Ordering::Relaxed)
    }

    /// Convert an offset into the number of the page it belongs to and the offset within that page.
    pub fn offset_to_page(&self, offset: u64) -> (u64, usize) {
        Pool::offset_to_page(self.page_size, offset)
    }

    /// Read the specified bytes, preferentially from the buffer pool cache. Bytes not found in the
    /// buffer pool will be read from the provided `blob` and cached for future reads.
    ///
    /// # Warning
    ///
    /// Attempts to read any of the last (blob_size % page_size) "trailing bytes" of the blob will
    /// result in a ReadFailed error since the buffer pool only deals with page sized chunks.
    /// Trailing bytes need to be dealt with outside of the buffer pool. For example,
    /// [crate::buffer::Append] uses a [crate::buffer::tip::Buffer] to buffer them.
    pub(super) async fn read<B: Blob>(
        &self,
        blob: &B,
        blob_id: u64,
        mut buf: &mut [u8],
        mut offset: u64,
    ) -> Result<(), Error> {
        // Read up to a page worth of data at a time from either the buffer pool or the `blob`,
        // until the requested data is fully read.
        while !buf.is_empty() {
            // Read lock the buffer pool and see if we can get (some of) the data from it.
            {
                let buffer_pool = self.pool.read().await;
                let count = buffer_pool.read_at(self.page_size, blob_id, buf, offset);
                if count != 0 {
                    offset += count as u64;
                    buf = &mut buf[count..];
                    continue;
                }
            }
            // The read lock is released here (end of scope) before we do any slow fetch work.

            // Handle page fault. `read_after_page_fault` always reads at least one byte on
            // success, so the loop makes progress.
            let count = self
                .read_after_page_fault(blob, blob_id, buf, offset)
                .await?;
            offset += count as u64;
            buf = &mut buf[count..];
        }

        Ok(())
    }

    /// Fetch the specified page after encountering a page fault, which may involve retrieving it
    /// from `blob` & caching the result in `pool`. Returns the number of bytes read, which should
    /// always be non-zero.
    async fn read_after_page_fault<B: Blob>(
        &self,
        blob: &B,
        blob_id: u64,
        buf: &mut [u8],
        offset: u64,
    ) -> Result<usize, Error> {
        assert!(!buf.is_empty());

        let (page_num, offset_in_page) = Pool::offset_to_page(self.page_size, offset);
        let page_size = self.page_size;
        trace!(page_num, blob_id, "page fault");

        // Create or clone a future that retrieves the desired page from the underlying blob. This
        // requires a write lock on the buffer pool since we may need to modify `page_fetches` if
        // this is the first fetcher.
        let (fetch_future, is_first_fetcher) = {
            let mut pool = self.pool.write().await;

            // There's a (small) chance the page was fetched & buffered by another task before we
            // were able to acquire the write lock, so check the cache before doing anything else.
            let count = pool.read_at(page_size, blob_id, buf, offset);
            if count != 0 {
                return Ok(count);
            }

            let entry = pool.page_fetches.entry((blob_id, page_num));
            match entry {
                Entry::Occupied(o) => {
                    // Another thread is already fetching this page, so clone its existing future.
                    (o.get().clone(), false)
                }
                Entry::Vacant(v) => {
                    // Nobody is currently fetching this page, so create a future that will do the work.
                    let blob = blob.clone();
                    let future = async move {
                        blob.read_at(vec![0; page_size], page_num * page_size as u64)
                            .await
                            .map_err(Arc::new)
                    };

                    // Make the future shareable and insert it into the map.
                    let shareable = future.boxed().shared();
                    v.insert(shareable.clone());

                    (shareable, true)
                }
            }
        };
        // The write lock is dropped here (end of scope) before awaiting the fetch, so other
        // tasks can use the pool while the blob read is in flight.

        // Await the future and get the page buffer. If this isn't the task that initiated the
        // fetch, we can return immediately with the result. Note that we cannot return immediately
        // on error, since we'd bypass the cleanup required of the first fetcher.
        let fetch_result = fetch_future.await;
        if !is_first_fetcher {
            // Copy the requested portion of the page into the buffer and return immediately.
            let page_buf: Vec<u8> = fetch_result.map_err(|_| Error::ReadFailed)?.into();
            let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page);
            buf[..bytes_to_copy]
                .copy_from_slice(&page_buf[offset_in_page..offset_in_page + bytes_to_copy]);
            return Ok(bytes_to_copy);
        }

        // This is the task that initiated the fetch, so it is responsible for cleaning up the
        // inserted entry, and caching the page in the buffer pool if the fetch didn't error out.
        // This requires a write lock on the buffer pool to modify `page_fetches` and cache the
        // page.
        let mut pool = self.pool.write().await;

        // Remove the entry from `page_fetches`.
        let _ = pool.page_fetches.remove(&(blob_id, page_num));

        // Cache the result in the buffer pool.
        let Ok(page_buf) = fetch_result else {
            return Err(Error::ReadFailed);
        };
        pool.cache(page_size, blob_id, page_buf.as_ref(), page_num);

        // Copy the requested portion of the page into the buffer.
        let page_buf: Vec<u8> = page_buf.into();
        let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page);
        buf[..bytes_to_copy]
            .copy_from_slice(&page_buf[offset_in_page..offset_in_page + bytes_to_copy]);

        Ok(bytes_to_copy)
    }

    /// Cache the provided slice of data in the buffer pool, returning the remaining bytes that
    /// didn't fill a whole page. `offset` must be page aligned.
    ///
    /// # Panics
    ///
    /// Panics if `offset` is not page aligned.
    pub async fn cache(&self, blob_id: u64, mut buf: &[u8], offset: u64) -> usize {
        let (mut page_num, offset_in_page) = self.offset_to_page(offset);
        assert_eq!(offset_in_page, 0);
        {
            // Write lock the buffer pool.
            let mut buffer_pool = self.pool.write().await;
            while buf.len() >= self.page_size {
                buffer_pool.cache(self.page_size, blob_id, &buf[..self.page_size], page_num);
                buf = &buf[self.page_size..];
                page_num += 1;
            }
        }

        buf.len()
    }
}

impl Pool {
    /// Return a new empty buffer pool with a max cache capacity of `capacity` pages.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is 0.
    pub fn new(capacity: usize) -> Self {
        assert!(capacity > 0);
        Self {
            index: HashMap::with_capacity(capacity),
            // The cache never holds more than `capacity` entries, so pre-allocate it in full
            // to avoid repeated reallocation while the pool warms up.
            cache: Vec::with_capacity(capacity),
            clock: 0,
            capacity,
            page_fetches: HashMap::new(),
        }
    }

    /// Convert an offset into the number of the page it belongs to and the offset within that page.
    fn offset_to_page(page_size: usize, offset: u64) -> (u64, usize) {
        (
            offset / page_size as u64,
            (offset % page_size as u64) as usize,
        )
    }

    /// Attempt to fetch blob data starting at `offset` from the buffer pool. Returns the number of
    /// bytes read, which could be 0 if the first page in the requested range isn't buffered, and is
    /// never more than `page_size` or the length of `buf`. The returned bytes won't cross a
    /// page boundary, so multiple reads may be required even if all data in the desired range is
    /// buffered.
    fn read_at(&self, page_size: usize, blob_id: u64, buf: &mut [u8], offset: u64) -> usize {
        let (page_num, offset_in_page) = Self::offset_to_page(page_size, offset);
        let page_index = self.index.get(&(blob_id, page_num));
        let Some(&page_index) = page_index else {
            // Page fault: the requested page isn't cached.
            return 0;
        };
        let page = &self.cache[page_index];
        assert_eq!(page.key, (blob_id, page_num));
        // Mark the page as recently used so the Clock sweep skips it on the next pass.
        page.referenced.store(true, Ordering::Relaxed);
        let page = &page.data;

        // Copy out up to the end of `buf` or the end of the page, whichever comes first.
        let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page);
        buf[..bytes_to_copy].copy_from_slice(&page[offset_in_page..offset_in_page + bytes_to_copy]);

        bytes_to_copy
    }

    /// Put the given `page` into the buffer pool.
    ///
    /// # Panics
    ///
    /// Panics if the provided page is not exactly `page_size` bytes long.
    fn cache(&mut self, page_size: usize, blob_id: u64, page: &[u8], page_num: u64) {
        assert_eq!(page.len(), page_size);

        let key = (blob_id, page_num);
        let index_entry = self.index.entry(key);
        if let Entry::Occupied(index_entry) = index_entry {
            // This case can result when a blob is truncated across a page boundary, and later grows
            // back to (beyond) its original size. It will also become expected behavior once we
            // allow cached pages to be writable.
            debug!(blob_id, page_num, "updating duplicate page");

            // Update the stale data with the new page.
            let entry = &mut self.cache[*index_entry.get()];
            assert_eq!(entry.key, key);
            entry.referenced.store(true, Ordering::Relaxed);
            entry.data.copy_from_slice(page);
            return;
        }

        // Cache still has room: append without evicting anything.
        if self.cache.len() < self.capacity {
            self.index.insert(key, self.cache.len());
            self.cache.push(CacheEntry {
                key,
                referenced: AtomicBool::new(true),
                data: page.into(),
            });
            return;
        }

        // Cache is full, find a page to evict: sweep the clock hand forward, clearing reference
        // bits as we go, until we land on a page not referenced since the last sweep. This
        // terminates because clearing bits guarantees a victim within one full revolution.
        while self.cache[self.clock].referenced.load(Ordering::Relaxed) {
            self.cache[self.clock]
                .referenced
                .store(false, Ordering::Relaxed);
            self.clock = (self.clock + 1) % self.cache.len();
        }

        // Evict the page by replacing it with the new page.
        let entry = &mut self.cache[self.clock];
        entry.referenced.store(true, Ordering::Relaxed);
        assert!(self.index.remove(&entry.key).is_some());
        self.index.insert(key, self.clock);
        entry.key = key;
        entry.data.copy_from_slice(page);

        // Move the clock forward so the next eviction search starts past the new page.
        self.clock = (self.clock + 1) % self.cache.len();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{deterministic, Runner as _, Storage as _};
    use commonware_macros::test_traced;
    use commonware_utils::NZUsize;

    const PAGE_SIZE: usize = 1024;

    #[test_traced]
    fn test_pool_basic() {
        let mut pool: Pool = Pool::new(10);
        let mut out = vec![0; PAGE_SIZE];

        // An empty pool yields no data.
        assert_eq!(pool.read_at(PAGE_SIZE, 0, &mut out, 0), 0);

        // After caching a page, a full-page read returns its contents.
        pool.cache(PAGE_SIZE, 0, &[1; PAGE_SIZE], 0);
        assert_eq!(pool.read_at(PAGE_SIZE, 0, &mut out, 0), PAGE_SIZE);
        assert_eq!(out, [1; PAGE_SIZE]);

        // Re-caching the same page overwrites it (logs a duplicate-page message but still works).
        pool.cache(PAGE_SIZE, 0, &[2; PAGE_SIZE], 0);
        assert_eq!(pool.read_at(PAGE_SIZE, 0, &mut out, 0), PAGE_SIZE);
        assert_eq!(out, [2; PAGE_SIZE]);

        // Overfill the pool: 11 pages into a 10-page pool.
        for page in 0u64..11 {
            pool.cache(PAGE_SIZE, 0, &[page as u8; PAGE_SIZE], page);
        }
        // Page 0 should have been evicted.
        assert_eq!(pool.read_at(PAGE_SIZE, 0, &mut out, 0), 0);
        // Pages 1 through 10 should all remain resident.
        for page in 1u64..11 {
            assert_eq!(
                pool.read_at(PAGE_SIZE, 0, &mut out, page * PAGE_SIZE as u64),
                PAGE_SIZE
            );
            assert_eq!(out, [page as u8; PAGE_SIZE]);
        }

        // A read from an unaligned offset (aligned + 2) stops at the page boundary, coming up
        // 2 bytes short of a full page.
        let mut out = vec![0; PAGE_SIZE];
        assert_eq!(
            pool.read_at(PAGE_SIZE, 0, &mut out, PAGE_SIZE as u64 + 2),
            PAGE_SIZE - 2
        );
        assert_eq!(&out[..PAGE_SIZE - 2], [1; PAGE_SIZE - 2]);
    }

    #[test_traced]
    fn test_pool_read_with_blob() {
        // Run the whole test inside the deterministic executor.
        let runner = deterministic::Runner::default();
        runner.start(|context| async move {
            // Write 11 pages of distinct data to a fresh blob.
            let (blob, size) = context
                .open("test", "blob".as_bytes())
                .await
                .expect("Failed to open blob");
            assert_eq!(size, 0);
            for page in 0..11 {
                let data = vec![page as u8; PAGE_SIZE];
                blob.write_at(data, page * PAGE_SIZE as u64).await.unwrap();
            }

            // Fault every page in through the buffer pool and verify its contents.
            let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10));
            assert_eq!(pool_ref.next_id().await, 0);
            assert_eq!(pool_ref.next_id().await, 1);
            for page in 0..11 {
                let mut out = vec![0; PAGE_SIZE];
                pool_ref
                    .read(&blob, 0, &mut out, page * PAGE_SIZE as u64)
                    .await
                    .unwrap();
                assert_eq!(out, [page as u8; PAGE_SIZE]);
            }

            // Read again, this time served from the cache. Start at page 1 since page 0 was
            // evicted when the 11th page was cached into the 10-page pool.
            for page in 1..11 {
                let mut out = vec![0; PAGE_SIZE];
                pool_ref
                    .read(&blob, 0, &mut out, page * PAGE_SIZE as u64)
                    .await
                    .unwrap();
                assert_eq!(out, [page as u8; PAGE_SIZE]);
            }

            // Cleanup.
            blob.sync().await.unwrap();
        });
    }
}