1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
//! # Batch Write
//!
//! This module adds batch write functionality to DbInner. Prior to this feature,
//! writes were performed directly in DbInner's `put_with_options` and
//! `delete_with_options` methods. For each operation, a lock was acquired on the
//! db_state to mutate the WAL or memtable. This worked fine for single writes,
//! but for batch writes, which take longer, it could create contention on the
//! lock. This is dangerous in an async runtime because it can block the
//! threads, leading to starvation.
//!
//! This module spawns a separate task to handle batch writes. The task receives
//! a `WriteBatchMsg`, which contains a `WriteBatchRequest`. The `WriteBatchRequest`
//! contains a `WriteBatch` containing Put/Delete operations and a `oneshot::Sender`.
//! The `Sender` is used to send the table that the batch was written to back to the
//! caller so the caller can `.await` the result. The result is that callers safely
//! `.await` on their writes rather than holding a lock on the db_state.
//!
//! Centralizing the writes in a single event loop also provides a single location to
//! assign sequence numbers when we implement MVCC.
//!
//! [Pebble](https://github.com/cockroachdb/pebble) has a similar design and
//! [a good write-up](https://github.com/cockroachdb/pebble/blob/master/docs/rocksdb.md#commit-pipeline)
//! describing its benefits.
//!
//! _Note: The `write_batch` loop still holds a lock on the db_state. There can still
//! be contention between `get`s, which hold a lock, and the write loop._
use core::panic;
use log::{info, warn};
use std::sync::Arc;
use tokio::runtime::Handle;
use crate::types::{RowEntry, ValueDeletable};
use crate::utils::spawn_bg_task;
use crate::{
batch::{WriteBatch, WriteOp},
db::DbInner,
error::SlateDBError,
mem_table::KVTable,
};
/// Messages consumed by the write loop started in `DbInner::spawn_write_task`.
pub(crate) enum WriteBatchMsg {
/// Asks the write loop to stop. The loop keeps processing messages until
/// the channel is empty and only then exits.
Shutdown,
/// Carries a batch to apply together with the channel used to report
/// the outcome back to the caller.
WriteBatch(WriteBatchRequest),
}
/// A single batch-write request handed to the write loop.
pub(crate) struct WriteBatchRequest {
/// The Put/Delete operations to apply.
pub(crate) batch: WriteBatch,
/// One-shot channel on which the write loop sends the result of the write:
/// the table the batch was written to on success, or the error otherwise.
pub(crate) done: tokio::sync::oneshot::Sender<Result<Arc<KVTable>, SlateDBError>>,
}
impl DbInner {
#[allow(clippy::panic)]
async fn write_batch(&self, batch: WriteBatch) -> Result<Arc<KVTable>, SlateDBError> {
let now = self.mono_clock.now().await?;
let current_table = if self.wal_enabled() {
let mut guard = self.state.write();
let seq = guard.increment_seq();
let current_wal = guard.wal();
for op in batch.ops {
match op {
WriteOp::Put(key, value, opts) => {
current_wal.put(RowEntry {
key,
value: ValueDeletable::Value(value),
create_ts: Some(now),
expire_ts: opts.expire_ts_from(self.options.default_ttl, now),
seq,
});
}
WriteOp::Delete(key) => {
current_wal.put(RowEntry {
key,
value: ValueDeletable::Tombstone,
create_ts: Some(now),
expire_ts: None,
seq,
});
}
}
}
let table = current_wal.table().clone();
self.maybe_freeze_wal(&mut guard)?;
table
} else {
if cfg!(not(feature = "wal_disable")) {
panic!("wal_disabled feature must be enabled");
}
let mut guard = self.state.write();
let seq = guard.increment_seq();
let current_memtable = guard.memtable();
for op in batch.ops {
match op {
WriteOp::Put(key, value, opts) => {
current_memtable.put(RowEntry {
key,
value: ValueDeletable::Value(value),
create_ts: Some(now),
expire_ts: opts.expire_ts_from(self.options.default_ttl, now),
seq,
});
}
WriteOp::Delete(key) => {
current_memtable.put(RowEntry {
key,
value: ValueDeletable::Tombstone,
create_ts: Some(now),
expire_ts: None,
seq,
});
}
}
}
let table = current_memtable.table().clone();
let last_wal_id = guard.last_written_wal_id();
self.maybe_freeze_memtable(&mut guard, last_wal_id)?;
table
};
Ok(current_table)
}
pub(crate) fn spawn_write_task(
self: &Arc<Self>,
mut rx: tokio::sync::mpsc::UnboundedReceiver<WriteBatchMsg>,
tokio_handle: &Handle,
) -> Option<tokio::task::JoinHandle<Result<(), SlateDBError>>> {
let this = Arc::clone(self);
let mut is_stopped = false;
let fut = async move {
while !(is_stopped && rx.is_empty()) {
match rx.recv().await.expect("unexpected channel close") {
WriteBatchMsg::WriteBatch(write_batch_request) => {
let WriteBatchRequest { batch, done } = write_batch_request;
let result = this.write_batch(batch).await;
_ = done.send(result);
}
WriteBatchMsg::Shutdown => {
is_stopped = true;
}
}
}
Ok(())
};
let this = Arc::clone(self);
Some(spawn_bg_task(
tokio_handle,
move |result| {
let err = match result {
Ok(()) => {
info!("write task shutdown complete");
SlateDBError::BackgroundTaskShutdown
}
Err(err) => {
warn!("write task exited with {:?}", err);
err.clone()
}
};
// notify any waiters about the failure
let mut state = this.state.write();
state.record_fatal_error(err.clone());
},
fut,
))
}
}