1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#pragma once
#include <atomic>
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>
#include "common/types/types.h"
#include "storage/buffer_manager/memory_manager.h"
#include "storage/buffer_manager/page_state.h"
#include "storage/enums/page_read_policy.h"
#include "storage/file_handle.h"
namespace lbug {
namespace main {
struct DBConfig;
};
namespace common {
class VirtualFileSystem;
};
namespace testing {
class FlakyBufferManager;
class BufferManagerTest;
class CopyTestHelper;
}; // namespace testing
namespace storage {
class ChunkedNodeGroup;
class Spiller;
// This class keeps state info for pages potentially can be evicted.
// The page state of a candidate is set to be MARKED when it is first enqueued. After enqueued, if
// the candidate was recently accessed, it is no longer immediately evictable. See the state
// transition diagram above `BufferManager` class declaration for more details.
struct EvictionCandidate {
friend class EvictionQueue;
// If the candidate is MARKED, it is evictable.
static bool isEvictable(uint64_t currPageStateAndVersion) {
return PageState::getState(currPageStateAndVersion) == PageState::MARKED;
}
// If the candidate was recently read optimistically, it is second chance evictable.
static bool isSecondChanceEvictable(uint64_t currPageStateAndVersion) {
return PageState::getState(currPageStateAndVersion) == PageState::UNLOCKED;
}
static bool isEvicted(uint64_t currPageStateAndVersion) {
return PageState::getState(currPageStateAndVersion) == PageState::EVICTED;
}
bool operator==(const EvictionCandidate& other) const {
return fileIdx == other.fileIdx && pageIdx == other.pageIdx;
}
// Returns false if the candidate was not empty, or if another thread set the value first
bool set(const EvictionCandidate& newCandidate);
uint32_t fileIdx = UINT32_MAX;
common::page_idx_t pageIdx = common::INVALID_PAGE_IDX;
};
// A circular buffer queue storing eviction candidates
// One candidate should be stored for each page currently in memory
class EvictionQueue {
public:
static constexpr auto EMPTY = EvictionCandidate{UINT32_MAX, common::INVALID_PAGE_IDX};
static constexpr size_t BATCH_SIZE = 64;
explicit EvictionQueue(uint64_t capacity)
// Capacity needs to be a multiple of the batch size
: insertCursor{0}, evictionCursor{0}, size{0},
capacity{capacity + (BATCH_SIZE - capacity % BATCH_SIZE)},
data{std::make_unique<std::atomic<EvictionCandidate>[]>(this->capacity)} {}
bool insert(uint32_t fileIndex, common::page_idx_t pageIndex);
// Produces the next non-empty candidate to be tried for eviction.
// Note that it is still possible (though unlikely) for another thread to evict this candidate,
// so it is not guaranteed to be empty.
// The PageState should be locked, and then the atomic checked against the version used when
// locking the page state to make sure there wasn't a data race
std::span<std::atomic<EvictionCandidate>, BATCH_SIZE> next();
void clear(std::atomic<EvictionCandidate>& candidate);
bool tryClear(std::atomic<EvictionCandidate>& candidate, const EvictionCandidate& expected);
uint64_t getSize() const { return size; }
uint64_t getEvictionCursor() const { return evictionCursor; }
uint64_t getCapacity() const { return capacity; }
private:
std::atomic<uint64_t> insertCursor;
std::atomic<uint64_t> evictionCursor;
std::atomic<uint64_t> size;
const uint64_t capacity;
std::unique_ptr<std::atomic<EvictionCandidate>[]> data;
};
/**
* The Buffer Manager (BM) is a centralized manager of database memory resources.
* It provides two main functionalities:
* 1) it provides the high-level functionality to pin() and unpin() the pages of the database files
* used by storage structures, such as the Column, Lists, or HashIndex in the storage layer, and
* operates via their FileHandle to read/write the page data into/out of one of the frames.
* 2) it provides optimistic read of pages, which optimistically read unlocked or marked pages
* without acquiring locks.
* 3) it supports the MemoryManager (MM) to allocate memory buffers that are not
* backed by any disk files. Similar to disk files, MM provides in-mem file handles to the BM to
* pin/unpin pages. Pin happens when MM tries to allocate a new memory buffer, and unpin happens
* when MM tries to reclaim a memory buffer.
*
* Specifically, in BM's context, page is the basic management unit of data in a file. The file can
* be a disk file, such as a column file, or an in-mem file, such as an temp in-memory file kept by
* the MM. Frame is the basic management unit of data resides in a VMRegion, namely in a virtual
* memory space. Each page is uniquely mapped to a frame, and it can be cached into or evicted from
* the frame. See `VMRegion` for more details.
*
* When users unpin their pages, the BM might spill them to disk. The behavior of what is guaranteed
* to be kept in frame and what can be spilled to disk is directly determined by the pin/unpin
* calls of the users.
*
* Also, BM provides some specialized functionalities for WAL files:
* 1) it supports the caller to set pinned pages as dirty, which will be safely written back to disk
* when the pages are evicted;
* 2) it supports the caller to flush or remove pages from the BM;
* 3) it supports the caller to directly update the content of a frame.
*
* All accesses to the BM are through a FileHandle. This design is to decentralize the management
* of page states from the BM to each file handle itself. Thus, each on-disk file should have a
* unique FileHandle, and MM also holds a unique FileHandle, which is backed by a temp in-mem
* file, for all memory buffer allocations
*
* To start a Database, users need to specify the max size of the memory usage (`maxSize`) in BM.
* If users don't specify the value, the system will set maxSize to available physical mem *
* DEFAULT_PHY_MEM_SIZE_RATIO_FOR_BM (defined in constants.h).
* The BM relies on virtual memory regions mapped through `mmap` to anonymous address spaces.
* 1) For disk pages, BM allocates a virtual memory region of DEFAULT_VM_REGION_MAX_SIZE (defined in
* constants.h), which is usually much larger than `maxSize`, and is expected to be large enough to
* contain all disk pages. Each disk page in database files is directly mapped to a unique
* PAGE_SIZE frame in the region.
* 2) For each FileHandle backed by a temp in-mem file in MM, BM allocates a virtual memory region
* of `maxSize` for it. Each memory buffer is mapped to a unique TEMP_PAGE_SIZE frame in that
* region. Both disk pages and memory buffers are all managed by the BM to make sure that actually
* used physical memory doesn't go beyond max size specified by users. Currently, the BM uses a
* queue based replacement policy and the MADV_DONTNEED hint to explicitly control evictions. See
* comments above `claimAFrame()` for more details.
*
* Page states in BM:
* A page can be in one of the four states: a) LOCKED, b) UNLOCKED, c) MARKED, d) EVICTED.
* Every page is initialized as in the EVICTED state.
* The state transition diagram of page X is as follows (oRead refers to optimisticRead):
* Note: optimistic reads on UNLOCKED pages don't make any changes to pages' states. oRead on
* UNLOCKED is omitted in the diagram.
*
* 7.2. pin(pY): evict pX. 7.1. pin(pY): tryLock(pX)
* |<-------------------------|<------------------------------------------------------------|
* | | 4. pin(pX) |
* | |<------------------------------------------------------------|
* | 1. pin(pX) | 5. pin(pX) 6. pin(pY): 2nd chance eviction |
* EVICTED ------------------> LOCKED <-------------UNLOCKED ------------------------------> MARKED
* | | 3. oRead(pX) |
* | <--------------------------------------|
* | 2. unpin(pX): enqueue pX & increment version |
* ------------------------------------------------------------->
*
* 1. When page pX at EVICTED state, and it is pinned, it transits to the Locked state. `pin` will
* first acquire the exclusive lock on the page, then read the page from the disk into its frame.
* The caller can safely make changes to the page.
* 2. When the caller finishes changes to the page, it calls `unpin`, which releases the lock on the
* page, puts the page into the eviction queue, and increments its version. The page now transits to
* the MARKED state. Note that currently the page is still cached, but it is ready to be evicted.
* The page version number is used to identify any potential writing on the page. Each time a page
* transits from LOCKED to MARKED state, we will increment its version. This happens when a page is
* pinned, then unpinned. During the pin and unpin, we assume the page's content in its
* corresponding frame might have changed, thus, we increment the version number to forbid stale
* reads on it;
* 3. The MARKED page can be optimistically read by the caller, setting the page's state to
* UNLOCKED. For evicted pages, optimistic reads will trigger pin and unpin to read pages from disk
* into frames.
* 4. The MARKED page can be pinned again by the caller, setting the page's state to LOCKED.
* 5. The UNLOCKED page can also be pinned again by the caller, setting the page's state to LOCKED.
* 6. During eviction, UNLOCKED pages will be checked if they are second chance evictable. If so,
* they will be set to MARKED, and their eviction candidates will be moved back to the eviction
* queue.
* 7. During eviction, if the page is in the MARKED state, it will be LOCKED first (7.1), then
* removed from its frame, and set to EVICTED (7.2).
*
* The design is inspired by vmcache in the paper "Virtual-Memory Assisted Buffer Management"
* (https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf).
* We would also like to thank Fadhil Abubaker for doing the initial research and prototyping of
* Umbra's design in his CS 848 course project:
* https://github.com/fabubaker/lbug/blob/umbra-bm/final_project_report.pdf.
*/
class BufferManager {
friend class testing::FlakyBufferManager;
friend class testing::BufferManagerTest;
friend class testing::CopyTestHelper;
friend class FileHandle;
friend class MemoryManager;
public:
BufferManager(const std::string& databasePath, const std::string& spillToDiskPath,
uint64_t bufferPoolSize, uint64_t maxDBSize, common::VirtualFileSystem* vfs, bool readOnly);
virtual ~BufferManager();
// Currently, these functions are specifically used only for WAL files.
void removeFilePagesFromFrames(FileHandle& fileHandle);
void updateFrameIfPageIsInFrameWithoutLock(common::file_idx_t fileIdx, const uint8_t* newPage,
common::page_idx_t pageIdx);
// Thread-safe variant that acquires the page state lock before updating the frame.
void updateFrameIfPageIsInFrame(common::file_idx_t fileIdx, const uint8_t* newPage,
common::page_idx_t pageIdx);
// For files that are managed by BM, their FileHandles should be created through this function.
FileHandle* getFileHandle(const std::string& filePath, uint8_t flags,
common::VirtualFileSystem* vfs, main::ClientContext* context) {
fileHandles.emplace_back(
std::make_unique<FileHandle>(filePath, flags, this, fileHandles.size(), vfs, context));
return fileHandles.back().get();
}
uint64_t getMemoryLimit() const { return bufferPoolSize; }
uint64_t getUsedMemory() const { return usedMemory; }
void getSpillerOrSkip(std::function<void(Spiller&)> func) {
if (spiller) {
return func(*spiller);
}
}
void resetSpiller(std::string spillPath);
// This function only works when run in a single-threaded context
// Iterates through the eviction queue and removes any elements that have already been evicted
// (due to some external intervention)
void removeEvictedCandidates();
protected:
// Reclaims used memory until the given size to reserve is available.
// The specified amount of memory will be recorded as being used
virtual bool reserve(uint64_t sizeToReserve);
private:
uint8_t* pin(FileHandle& fileHandle, common::page_idx_t pageIdx,
PageReadPolicy pageReadPolicy = PageReadPolicy::READ_PAGE);
void optimisticRead(FileHandle& fileHandle, common::page_idx_t pageIdx,
const std::function<void(uint8_t*)>& func);
// The function assumes that the requested page is already pinned.
void unpin(FileHandle& fileHandle, common::page_idx_t pageIdx);
uint8_t* getFrame(FileHandle& fileHandle, common::page_idx_t pageIdx) const {
#if BM_MALLOC
return fileHandle.getPageState(pageIdx)->getPage();
#else
return vmRegions[fileHandle.getPageSizeClass()]->getFrame(fileHandle.getFrameIdx(pageIdx));
#endif
}
common::frame_group_idx_t addNewFrameGroup(
common::PageSizeClass pageSizeClass [[maybe_unused]]) {
#if BM_MALLOC
return 0;
#else
return vmRegions[pageSizeClass]->addNewFrameGroup();
#endif
}
void removePageFromFrameIfNecessary(FileHandle& fileHandle, common::page_idx_t pageIdx);
static void verifySizeParams(uint64_t bufferPoolSize, uint64_t maxDBSize);
bool claimAFrame(FileHandle& fileHandle, common::page_idx_t pageIdx,
PageReadPolicy pageReadPolicy);
// Return number of bytes freed.
uint64_t tryEvictPage(std::atomic<EvictionCandidate>& candidate);
void cachePageIntoFrame(FileHandle& fileHandle, common::page_idx_t pageIdx,
PageReadPolicy pageReadPolicy);
void removePageFromFrame(FileHandle& fileHandle, common::page_idx_t pageIdx, bool shouldFlush);
uint64_t freeUsedMemory(uint64_t size);
void releaseFrameForPage(FileHandle& fileHandle [[maybe_unused]],
common::page_idx_t pageIdx [[maybe_unused]]) {
#if BM_MALLOC
// Page is freed instead in PageState::resetToEvicted
#else
vmRegions[fileHandle.getPageSizeClass()]->releaseFrame(fileHandle.getFrameIdx(pageIdx));
#endif
}
uint64_t evictPages();
private:
std::atomic<uint64_t> bufferPoolSize;
EvictionQueue evictionQueue;
// Total memory used
std::atomic<uint64_t> usedMemory;
// Amount of memory used, which cannot be evicted
std::atomic<uint64_t> nonEvictableMemory;
// Each VMRegion corresponds to a virtual memory region of a specific page size. Currently, we
// hold two sizes of REGULAR_PAGE and TEMP_PAGE.
std::array<std::unique_ptr<VMRegion>, 2> vmRegions;
std::vector<std::unique_ptr<FileHandle>> fileHandles;
std::unique_ptr<Spiller> spiller;
common::VirtualFileSystem* vfs;
};
} // namespace storage
} // namespace lbug