txn_db/store.rs
1//! The version store: where committed versions live.
2//!
3//! `txn-db` is the transaction layer, not the storage layer. It owns
4//! visibility, conflict detection, and commit ordering, but it delegates the
5//! actual keeping of versioned bytes to a [`VersionStore`]. That trait is the
6//! crate's Tier-3 seam: implement it over an LSM tree, a B-tree, a remote
7//! service — anything that can keep multiple timestamped versions of a key —
8//! and the transaction semantics compose on top unchanged.
9//!
10//! A [`MemoryStore`] ships for the common in-process case, for tests, and for
11//! examples. It is the default backing store of [`Db::new`](crate::Db::new).
12//!
13//! ## The contract a store must uphold
14//!
15//! A correct [`VersionStore`] keeps, for each key, the history of versions it
16//! has been asked to apply, each tagged with the commit timestamp it was applied
17//! at. Its two obligations are:
18//!
19//! - [`get`](VersionStore::get) returns the *newest* version whose commit
20//! timestamp is less than or equal to the caller's snapshot timestamp — the
21//! snapshot-read rule. A tombstone (a delete) at that position reads as
22//! "absent".
23//! - [`try_commit`](VersionStore::try_commit) validates a transaction's read and
24//! write sets against its snapshot and, if nothing conflicts, installs its
25//! writes at the commit timestamp — atomically with respect to other commits
26//! touching the same keys. This single method is what makes the store the
27//! serialization point for concurrent commits.
28//!
29//! ## Sharding
30//!
31//! [`MemoryStore`] partitions keys across independent shards, each with its own
32//! lock. Reads and commits that touch disjoint shards proceed without
33//! contending; a commit locks only the shards its keys fall in, in a fixed order
34//! so concurrent commits cannot deadlock. This is the sharded commit path the
35//! single global commit lock of the foundation release grew into.
36
37use std::collections::HashMap;
38use std::sync::Arc;
39
40use crate::error::{Result, TxnError};
41use crate::sync::{self, RwLock, RwLockWriteGuard};
42use crate::timestamp::Timestamp;
43
/// One entry in a commit batch handed to [`VersionStore::try_commit`].
///
/// A key paired with the value to write at the commit timestamp (`Some`) or a
/// tombstone marking a delete (`None`).
///
/// Both halves are `Arc<[u8]>`, so cloning an entry bumps reference counts
/// rather than copying the underlying bytes.
pub type WriteEntry = (Arc<[u8]>, Option<Arc<[u8]>>);
49
/// Default number of shards. A power of two so the shard index is a mask, not a
/// division. Sixteen spreads contention well for in-process workloads without
/// the per-commit cost of locking a long list of shards. Loom builds use far
/// fewer to keep the interleaving search tractable.
///
/// Within this file the constant is consumed only by [`MemoryStore::new`],
/// which forwards it to [`MemoryStore::with_shards`].
#[cfg(not(loom))]
const DEFAULT_SHARDS: usize = 16;
/// Loom-build value of the default shard count: two shards still exercise the
/// multi-shard locking path while keeping the explored state space small.
#[cfg(loom)]
const DEFAULT_SHARDS: usize = 2;
58
/// A keeper of timestamped versions, the backend a [`Db`](crate::Db) is built on.
///
/// This is the extension point for plugging `txn-db` onto a real storage
/// engine. The transaction layer supplies the snapshot timestamps and the read
/// and write sets; the store stores versions and enforces, atomically, that a
/// commit only lands when nothing it depends on has changed. The two required
/// methods below state the precise contract; [`collect_garbage`] is optional
/// and defaults to a no-op.
///
/// Implementations must be `Send + Sync`: a [`Db`](crate::Db) shares one store
/// across every thread that holds a clone of it.
///
/// [`collect_garbage`]: VersionStore::collect_garbage
///
/// # Examples
///
/// Driving the shipped [`MemoryStore`] directly through the trait:
///
/// ```
/// use std::sync::Arc;
/// use txn_db::{MemoryStore, Timestamp, VersionStore};
///
/// let store = MemoryStore::new();
/// let key: Arc<[u8]> = Arc::from(&b"k"[..]);
///
/// // Commit one version at timestamp 1 (snapshot 0, no reads to validate).
/// store.try_commit(
///     Timestamp::ZERO,
///     Timestamp::from_raw(1),
///     vec![(key.clone(), Some(Arc::from(&b"v1"[..])))],
///     &[],
/// )?;
///
/// // A reader at timestamp 1 sees it; a reader at timestamp 0 does not.
/// assert_eq!(store.get(b"k", Timestamp::from_raw(1))?.as_deref(), Some(&b"v1"[..]));
/// assert_eq!(store.get(b"k", Timestamp::ZERO)?, None);
/// # Ok::<(), txn_db::TxnError>(())
/// ```
pub trait VersionStore: Send + Sync {
    /// Return the value of `key` visible at `read_ts`.
    ///
    /// The result is the value of the newest version of `key` whose commit
    /// timestamp is `<= read_ts`, or `None` if there is no such version or the
    /// newest visible version is a tombstone (the key was deleted as of
    /// `read_ts`). The two `None` cases are deliberately indistinguishable to
    /// the caller.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Store`](crate::TxnError::Store) if the backend fails
    /// to service the read. [`MemoryStore`] never fails.
    fn get(&self, key: &[u8], read_ts: Timestamp) -> Result<Option<Arc<[u8]>>>;

    /// Validate a transaction and, if it does not conflict, apply its writes.
    ///
    /// The store must perform the following as one step, atomic with respect to
    /// any other `try_commit` that touches an overlapping key:
    ///
    /// 1. **Validate.** For every key in `writes` and every key in `reads`,
    ///    check that the key has no version with a commit timestamp greater than
    ///    `read_ts` — that is, that nothing the transaction wrote or read has
    ///    changed since its snapshot. `reads` is empty for snapshot-isolation
    ///    transactions and carries the read set for serializable ones.
    /// 2. **Apply.** If validation passes, install each write in `writes` as a
    ///    new version stamped `commit_ts` (`Some` is a value, `None` a
    ///    tombstone). The database guarantees `commit_ts` is unique and that
    ///    timestamps are handed out in increasing order.
    ///
    /// If any key fails validation, the store applies nothing and reports the
    /// conflict.
    ///
    /// NOTE(review): [`MemoryStore`] appends each new version to the end of the
    /// key's chain and relies on chains staying in ascending timestamp order;
    /// that holds only when `commit_ts > read_ts`. Presumably the database
    /// always satisfies this — confirm against the caller.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Conflict`](crate::TxnError::Conflict) if validation
    /// fails; no writes are applied. Returns
    /// [`TxnError::Store`](crate::TxnError::Store) if the backend fails to apply
    /// the batch.
    fn try_commit(
        &self,
        read_ts: Timestamp,
        commit_ts: Timestamp,
        writes: Vec<WriteEntry>,
        reads: &[Arc<[u8]>],
    ) -> Result<()>;

    /// Reclaim versions that no reader at or after `low_watermark` can observe,
    /// returning how many were removed.
    ///
    /// For each key, the newest version with a commit timestamp at or below
    /// `low_watermark` is the oldest one any live snapshot can still see;
    /// versions older than it are unreachable and may be dropped. A key whose
    /// only surviving version is a tombstone at or below the watermark may be
    /// removed entirely.
    ///
    /// Choosing the watermark is the caller's job: a snapshot read *below*
    /// `low_watermark` issued after collection may no longer find the version
    /// it would have seen before.
    ///
    /// The default implementation does nothing, so a store that does not retain
    /// history — or chooses not to collect — need not implement it. [`MemoryStore`]
    /// overrides it.
    fn collect_garbage(&self, low_watermark: Timestamp) -> usize {
        // Explicitly discard the argument: the default reclaims nothing, but the
        // parameter keeps its documented name instead of an underscore prefix.
        let _ = low_watermark;
        0
    }
}
157
/// One stored version of a key: the timestamp it became visible and its value.
///
/// A `value` of `None` is a tombstone — the key was deleted at `commit_ts`.
#[derive(Debug, Clone)]
struct Version {
    /// Commit timestamp this version was installed at; a snapshot read sees the
    /// version only when its own timestamp is at or above this one.
    commit_ts: Timestamp,
    /// The bytes written, or `None` for a delete.
    value: Option<Arc<[u8]>>,
}
166
/// One shard's map from key to its version chain, kept in ascending
/// commit-timestamp order.
///
/// The ordering is an invariant the rest of this file depends on: snapshot
/// reads and garbage collection locate versions with `partition_point`, and
/// conflict validation inspects only the last element of a chain.
type Chains = HashMap<Arc<[u8]>, Vec<Version>>;
170
/// One shard's slice of the keyspace.
///
/// Each shard owns an independent lock, so operations on keys that hash to
/// different shards do not contend with each other.
struct Shard {
    /// Version chains of every key that hashes to this shard.
    chains: RwLock<Chains>,
}
175
/// An in-memory [`VersionStore`] that shards the keyspace for concurrency.
///
/// Each key is hashed to one of a fixed number of shards; each shard holds its
/// keys' version chains behind its own reader-writer lock. Reads lock a single
/// shard; a commit locks only the shards its keys fall in. Commits to disjoint
/// shards therefore run in parallel, and the snapshot read of a key is a binary
/// search within its shard for the newest version at or below the snapshot
/// timestamp.
///
/// This is the default store of [`Db::new`](crate::Db::new) and suits caches,
/// tests, and workloads that fit in memory. The store never collects on its
/// own: versions accumulate until
/// [`collect_garbage`](VersionStore::collect_garbage) is called with a
/// watermark that has passed them, so a long-lived store under heavy overwrite
/// keeps growing between collections.
///
/// # Examples
///
/// ```
/// use txn_db::{Db, MemoryStore};
///
/// // `Db::new()` uses a `MemoryStore`; this is the explicit form.
/// let db = Db::with_store(MemoryStore::new());
/// let mut tx = db.begin();
/// tx.put(b"hello".to_vec(), b"world".to_vec());
/// tx.commit()?;
/// # Ok::<(), txn_db::TxnError>(())
/// ```
pub struct MemoryStore {
    /// The shards themselves; the length is always a power of two so that
    /// `mask` can select one.
    shards: Box<[Shard]>,
    /// `shard_count - 1`; ANDed with a key hash to pick a shard.
    mask: usize,
}
207
208impl Default for MemoryStore {
209 fn default() -> Self {
210 MemoryStore::new()
211 }
212}
213
214impl MemoryStore {
215 /// Create an empty in-memory store with the default shard count.
216 ///
217 /// # Examples
218 ///
219 /// ```
220 /// use txn_db::MemoryStore;
221 ///
222 /// let store = MemoryStore::new();
223 /// # let _ = store;
224 /// ```
225 #[must_use]
226 pub fn new() -> Self {
227 MemoryStore::with_shards(DEFAULT_SHARDS)
228 }
229
230 /// Create an empty store with a specific number of shards.
231 ///
232 /// `shards` is rounded up to a power of two (and at least one). More shards
233 /// reduce contention between commits that touch unrelated keys, at the cost
234 /// of a larger fixed footprint. The default of [`MemoryStore::new`] suits
235 /// most workloads; tune this only with a benchmark in hand.
236 ///
237 /// # Examples
238 ///
239 /// ```
240 /// use txn_db::MemoryStore;
241 ///
242 /// let store = MemoryStore::with_shards(64);
243 /// # let _ = store;
244 /// ```
245 #[must_use]
246 pub fn with_shards(shards: usize) -> Self {
247 let count = shards.max(1).next_power_of_two();
248 let shards = (0..count)
249 .map(|_| Shard {
250 chains: RwLock::new(HashMap::new()),
251 })
252 .collect::<Vec<_>>()
253 .into_boxed_slice();
254 MemoryStore {
255 shards,
256 mask: count - 1,
257 }
258 }
259
260 /// Number of distinct keys that have ever been written.
261 ///
262 /// Counts keys, not versions, and includes keys whose latest version is a
263 /// tombstone. Primarily useful in tests and diagnostics.
264 ///
265 /// # Examples
266 ///
267 /// ```
268 /// use txn_db::MemoryStore;
269 ///
270 /// let store = MemoryStore::new();
271 /// assert_eq!(store.key_count(), 0);
272 /// ```
273 #[must_use]
274 pub fn key_count(&self) -> usize {
275 self.shards
276 .iter()
277 .map(|shard| sync::read(&shard.chains).len())
278 .sum()
279 }
280
281 /// The shard a key belongs to.
282 #[inline]
283 fn shard_of(&self, key: &[u8]) -> usize {
284 (hash_key(key) as usize) & self.mask
285 }
286
287 /// Install a recovered version directly, without conflict validation.
288 ///
289 /// Used only during durability recovery, replaying a committed transaction
290 /// from the log. The caller installs recovered commits in ascending
291 /// commit-timestamp order, so each version is appended to the end of its
292 /// chain and the ascending invariant is preserved.
293 #[cfg(feature = "durability")]
294 pub(crate) fn install_recovered(&self, commit_ts: Timestamp, writes: Vec<WriteEntry>) {
295 for (key, value) in writes {
296 let shard = self.shard_of(&key);
297 sync::write(&self.shards[shard].chains)
298 .entry(key)
299 .or_default()
300 .push(Version { commit_ts, value });
301 }
302 }
303}
304
305impl VersionStore for MemoryStore {
306 fn get(&self, key: &[u8], read_ts: Timestamp) -> Result<Option<Arc<[u8]>>> {
307 let shard = &self.shards[self.shard_of(key)];
308 let chains = sync::read(&shard.chains);
309 Ok(visible_value(chains.get(key), read_ts))
310 }
311
312 fn try_commit(
313 &self,
314 read_ts: Timestamp,
315 commit_ts: Timestamp,
316 writes: Vec<WriteEntry>,
317 reads: &[Arc<[u8]>],
318 ) -> Result<()> {
319 // Shard of every touched key, computed once.
320 let write_shards: Vec<usize> = writes.iter().map(|(k, _)| self.shard_of(k)).collect();
321 let read_shards: Vec<usize> = reads.iter().map(|k| self.shard_of(k)).collect();
322
323 // The distinct shards to lock, in ascending order so concurrent commits
324 // acquire shared shards in the same sequence and cannot deadlock.
325 let mut to_lock: Vec<usize> = write_shards
326 .iter()
327 .copied()
328 .chain(read_shards.iter().copied())
329 .collect();
330 to_lock.sort_unstable();
331 to_lock.dedup();
332
333 let mut guards: Vec<RwLockWriteGuard<'_, Chains>> = Vec::with_capacity(to_lock.len());
334 for &shard in &to_lock {
335 guards.push(sync::write(&self.shards[shard].chains));
336 }
337
338 // Validate the write set, then the read set: abort if any touched key
339 // gained a version after the transaction's snapshot.
340 for (entry, &shard) in writes.iter().zip(&write_shards) {
341 if let Ok(pos) = to_lock.binary_search(&shard) {
342 if newer_than(guards[pos].get(entry.0.as_ref()), read_ts) {
343 return Err(TxnError::conflict(entry.0.len()));
344 }
345 }
346 }
347 for (key, &shard) in reads.iter().zip(&read_shards) {
348 if let Ok(pos) = to_lock.binary_search(&shard) {
349 if newer_than(guards[pos].get(key.as_ref()), read_ts) {
350 return Err(TxnError::conflict(key.len()));
351 }
352 }
353 }
354
355 // Apply: append a new version for each write under the held locks.
356 for ((key, value), &shard) in writes.into_iter().zip(&write_shards) {
357 if let Ok(pos) = to_lock.binary_search(&shard) {
358 guards[pos]
359 .entry(key)
360 .or_default()
361 .push(Version { commit_ts, value });
362 }
363 }
364 Ok(())
365 }
366
367 fn collect_garbage(&self, low_watermark: Timestamp) -> usize {
368 let mut reclaimed = 0;
369 for shard in &self.shards {
370 let mut chains = sync::write(&shard.chains);
371 chains.retain(|_key, chain| {
372 // Versions at or below the watermark; the last of them is the
373 // oldest any live snapshot can still observe.
374 let visible = chain.partition_point(|v| v.commit_ts <= low_watermark);
375 if visible > 1 {
376 // Drop everything before that oldest-observable version.
377 reclaimed += visible - 1;
378 let _ = chain.drain(0..visible - 1);
379 }
380 // A key whose only surviving version is a tombstone the watermark
381 // has passed is absent for every live reader: drop it entirely.
382 if chain.len() == 1
383 && chain[0].commit_ts <= low_watermark
384 && chain[0].value.is_none()
385 {
386 reclaimed += 1;
387 false
388 } else {
389 true
390 }
391 });
392 }
393 reclaimed
394 }
395}
396
397/// Whether `key`'s newest version (if any) was committed after `read_ts` — the
398/// condition that makes a commit conflict.
399#[inline]
400fn newer_than(versions: Option<&Vec<Version>>, read_ts: Timestamp) -> bool {
401 matches!(versions.and_then(|v| v.last()), Some(v) if v.commit_ts > read_ts)
402}
403
404/// The value of the newest version at or below `read_ts`, or `None` if there is
405/// none or it is a tombstone.
406#[inline]
407fn visible_value(versions: Option<&Vec<Version>>, read_ts: Timestamp) -> Option<Arc<[u8]>> {
408 let versions = versions?;
409 // Versions are ascending by commit timestamp; the newest visible one is the
410 // last entry whose timestamp is `<= read_ts`.
411 let visible = versions.partition_point(|v| v.commit_ts <= read_ts);
412 let idx = visible.checked_sub(1)?;
413 versions[idx].value.clone()
414}
415
/// 64-bit FNV-1a over the key bytes. Its only job is choosing a shard, so a
/// fast non-cryptographic spread is sufficient.
#[inline]
fn hash_key(key: &[u8]) -> u64 {
    /// FNV-1a 64-bit offset basis: the hash of the empty input.
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    /// FNV 64-bit prime.
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a order: XOR the byte in first, then multiply by the prime.
    key.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
427
#[cfg(all(test, not(loom)))]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Build a shared byte slice, usable as either a key or a value.
    fn k(b: &[u8]) -> Arc<[u8]> {
        b.into()
    }

    /// Shorthand for a raw timestamp.
    fn at(raw: u64) -> Timestamp {
        Timestamp::from_raw(raw)
    }

    /// Read `key` as of snapshot `raw`, unwrapping the store result.
    fn read(store: &MemoryStore, key: &[u8], raw: u64) -> Option<Arc<[u8]>> {
        store.get(key, at(raw)).unwrap()
    }

    /// Commit `writes` at `ts` from a snapshot taken at `ts - 1`, with no read
    /// set; panics if the store rejects the commit.
    fn commit(store: &MemoryStore, ts: u64, writes: Vec<WriteEntry>) {
        let outcome = store.try_commit(at(ts - 1), at(ts), writes, &[]);
        outcome.expect("commit");
    }

    #[test]
    fn test_get_on_missing_key_returns_none() {
        let store = MemoryStore::new();
        assert_eq!(read(&store, b"absent", 10), None);
    }

    #[test]
    fn test_read_sees_only_versions_at_or_before_snapshot() {
        let store = MemoryStore::new();
        commit(&store, 2, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 4, vec![(k(b"x"), Some(k(b"b")))]);

        // Before the first version, at each version, and in the gaps after.
        assert_eq!(read(&store, b"x", 1), None);
        assert_eq!(read(&store, b"x", 2).as_deref(), Some(&b"a"[..]));
        assert_eq!(read(&store, b"x", 3).as_deref(), Some(&b"a"[..]));
        assert_eq!(read(&store, b"x", 4).as_deref(), Some(&b"b"[..]));
        assert_eq!(read(&store, b"x", 99).as_deref(), Some(&b"b"[..]));
    }

    #[test]
    fn test_tombstone_reads_as_absent() {
        let store = MemoryStore::new();
        commit(&store, 1, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 2, vec![(k(b"x"), None)]);

        assert_eq!(read(&store, b"x", 1).as_deref(), Some(&b"a"[..]));
        assert_eq!(read(&store, b"x", 2), None);
    }

    #[test]
    fn test_write_write_conflict_is_detected() {
        let store = MemoryStore::new();
        commit(&store, 5, vec![(k(b"x"), Some(k(b"a")))]);

        // A transaction whose snapshot predates the existing version conflicts.
        let attempt = store.try_commit(at(4), at(6), vec![(k(b"x"), Some(k(b"b")))], &[]);
        let err = attempt.unwrap_err();
        assert!(matches!(err, TxnError::Conflict { .. }));

        // Nothing was applied.
        assert_eq!(read(&store, b"x", 99).as_deref(), Some(&b"a"[..]));
    }

    #[test]
    fn test_read_set_validation_detects_skew() {
        let store = MemoryStore::new();
        commit(&store, 5, vec![(k(b"y"), Some(k(b"1")))]);

        // Snapshot 4, write x, but read y which changed at ts 5 -> conflict.
        let attempt = store.try_commit(
            at(4),
            at(6),
            vec![(k(b"x"), Some(k(b"a")))],
            &[k(b"y")],
        );
        let err = attempt.unwrap_err();
        assert!(matches!(err, TxnError::Conflict { .. }));
    }

    #[test]
    fn test_multi_shard_commit_applies_all_keys() {
        let store = MemoryStore::with_shards(8);
        let writes: Vec<WriteEntry> = (0u8..32).map(|i| (k(&[i]), Some(k(&[i])))).collect();
        commit(&store, 1, writes);

        for i in 0u8..32 {
            assert_eq!(read(&store, &[i], 1).as_deref(), Some(&[i][..]));
        }
        assert_eq!(store.key_count(), 32);
    }

    #[test]
    fn test_with_shards_rounds_up_to_power_of_two() {
        let store = MemoryStore::with_shards(5);
        assert_eq!(store.shards.len(), 8);
        assert_eq!(store.mask, 7);
    }

    #[test]
    fn test_gc_prunes_versions_below_watermark_but_keeps_newest_visible() {
        let store = MemoryStore::new();
        commit(&store, 1, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 2, vec![(k(b"x"), Some(k(b"b")))]);
        commit(&store, 3, vec![(k(b"x"), Some(k(b"c")))]);

        // A reader at timestamp 2 must still see "b", so GC at watermark 2 keeps
        // the version at 2 and everything newer, dropping only the version at 1.
        let reclaimed = store.collect_garbage(at(2));
        assert_eq!(reclaimed, 1);
        assert_eq!(read(&store, b"x", 2).as_deref(), Some(&b"b"[..]));
        assert_eq!(read(&store, b"x", 3).as_deref(), Some(&b"c"[..]));
    }

    #[test]
    fn test_gc_drops_key_whose_only_survivor_is_a_passed_tombstone() {
        let store = MemoryStore::new();
        commit(&store, 1, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 2, vec![(k(b"x"), None)]); // delete

        // At watermark 5 the key is absent for everyone; it is dropped whole.
        let reclaimed = store.collect_garbage(at(5));
        assert_eq!(reclaimed, 2);
        assert_eq!(store.key_count(), 0);
    }

    #[test]
    fn test_gc_keeps_everything_above_watermark() {
        let store = MemoryStore::new();
        commit(&store, 5, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 6, vec![(k(b"x"), Some(k(b"b")))]);

        // A watermark below all versions reclaims nothing.
        assert_eq!(store.collect_garbage(at(4)), 0);
        assert_eq!(read(&store, b"x", 5).as_deref(), Some(&b"a"[..]));
    }

    #[test]
    fn test_default_trait_gc_is_noop() {
        // A store that implements only the required methods inherits the
        // default collector, which never reclaims.
        struct NoHistory;
        impl VersionStore for NoHistory {
            fn get(&self, _: &[u8], _: Timestamp) -> Result<Option<Arc<[u8]>>> {
                Ok(None)
            }
            fn try_commit(
                &self,
                _: Timestamp,
                _: Timestamp,
                _: Vec<WriteEntry>,
                _: &[Arc<[u8]>],
            ) -> Result<()> {
                Ok(())
            }
        }
        assert_eq!(NoHistory.collect_garbage(at(100)), 0);
    }
}