Skip to main content

txn_db/
store.rs

1//! The version store: where committed versions live.
2//!
3//! `txn-db` is the transaction layer, not the storage layer. It owns
4//! visibility, conflict detection, and commit ordering, but it delegates the
5//! actual keeping of versioned bytes to a [`VersionStore`]. That trait is the
6//! crate's Tier-3 seam: implement it over an LSM tree, a B-tree, a remote
7//! service — anything that can keep multiple timestamped versions of a key —
8//! and the transaction semantics compose on top unchanged.
9//!
10//! A [`MemoryStore`] ships for the common in-process case, for tests, and for
11//! examples. It is the default backing store of [`Db::new`](crate::Db::new).
12//!
13//! ## The contract a store must uphold
14//!
15//! A correct [`VersionStore`] keeps, for each key, the history of versions it
16//! has been asked to apply, each tagged with the commit timestamp it was applied
17//! at. Its two obligations are:
18//!
19//! - [`get`](VersionStore::get) returns the *newest* version whose commit
20//!   timestamp is less than or equal to the caller's snapshot timestamp — the
21//!   snapshot-read rule. A tombstone (a delete) at that position reads as
22//!   "absent".
23//! - [`try_commit`](VersionStore::try_commit) validates a transaction's read and
24//!   write sets against its snapshot and, if nothing conflicts, installs its
25//!   writes at the commit timestamp — atomically with respect to other commits
26//!   touching the same keys. This single method is what makes the store the
27//!   serialization point for concurrent commits.
28//!
29//! ## Sharding
30//!
31//! [`MemoryStore`] partitions keys across independent shards, each with its own
32//! lock. Reads and commits that touch disjoint shards proceed without
33//! contending; a commit locks only the shards its keys fall in, in a fixed order
//! so concurrent commits cannot deadlock. This is the sharded commit path that
//! grew out of the foundation release's single global commit lock.
36
37use std::collections::HashMap;
38use std::sync::Arc;
39
40use crate::error::{Result, TxnError};
41use crate::sync::{self, RwLock, RwLockWriteGuard};
42use crate::timestamp::Timestamp;
43
/// One entry in a commit batch handed to [`VersionStore::try_commit`].
///
/// A key paired with the value to write at the commit timestamp (`Some`) or a
/// tombstone marking a delete (`None`).
///
/// The first element is the key and the second is the new value. Both are
/// reference-counted byte slices, so a store can take ownership of an entry
/// and keep it without copying the underlying bytes.
pub type WriteEntry = (Arc<[u8]>, Option<Arc<[u8]>>);
49
/// Default number of shards. A power of two so the shard index is a mask, not a
/// division. Sixteen spreads contention well for in-process workloads without
/// the per-commit cost of locking a long list of shards. Loom builds use far
/// fewer to keep the interleaving search tractable.
#[cfg(not(loom))]
const DEFAULT_SHARDS: usize = 16;
/// Default number of shards under `--cfg loom`: still a power of two, but kept
/// small so the model checker's interleaving search stays tractable.
#[cfg(loom)]
const DEFAULT_SHARDS: usize = 2;
58
/// A keeper of timestamped versions, the backend a [`Db`](crate::Db) is built on.
///
/// This is the extension point for plugging `txn-db` onto a real storage
/// engine. The transaction layer supplies the snapshot timestamps and the read
/// and write sets; the store stores versions and enforces, atomically, that a
/// commit only lands when nothing it depends on has changed. The two methods
/// below state the precise contract.
///
/// Implementations must be `Send + Sync`: a [`Db`](crate::Db) shares one store
/// across every thread that holds a clone of it.
///
/// # Examples
///
/// Driving the shipped [`MemoryStore`] directly through the trait:
///
/// ```
/// use std::sync::Arc;
/// use txn_db::{MemoryStore, Timestamp, VersionStore};
///
/// let store = MemoryStore::new();
/// let key: Arc<[u8]> = Arc::from(&b"k"[..]);
///
/// // Commit one version at timestamp 1 (snapshot 0, no reads to validate).
/// store.try_commit(
///     Timestamp::ZERO,
///     Timestamp::from_raw(1),
///     vec![(key.clone(), Some(Arc::from(&b"v1"[..])))],
///     &[],
/// )?;
///
/// // A reader at timestamp 1 sees it; a reader at timestamp 0 does not.
/// assert_eq!(store.get(b"k", Timestamp::from_raw(1))?.as_deref(), Some(&b"v1"[..]));
/// assert_eq!(store.get(b"k", Timestamp::ZERO)?, None);
/// # Ok::<(), txn_db::TxnError>(())
/// ```
pub trait VersionStore: Send + Sync {
    /// Return the value of `key` visible at `read_ts`.
    ///
    /// The result is the value of the newest version of `key` whose commit
    /// timestamp is `<= read_ts`, or `None` if there is no such version or the
    /// newest visible version is a tombstone (the key was deleted as of
    /// `read_ts`).
    ///
    /// A key that was never written and a key that is deleted as of `read_ts`
    /// are indistinguishable to the caller: both read as `Ok(None)`.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Store`](crate::TxnError::Store) if the backend fails
    /// to service the read. [`MemoryStore`] never fails.
    fn get(&self, key: &[u8], read_ts: Timestamp) -> Result<Option<Arc<[u8]>>>;

    /// Validate a transaction and, if it does not conflict, apply its writes.
    ///
    /// `read_ts` is the transaction's snapshot timestamp and `commit_ts` the
    /// timestamp its writes become visible at. `writes` is passed by value, so
    /// an implementation can take ownership of the keys and values it
    /// installs; `reads` is only borrowed, because it is consulted for
    /// validation and never installed.
    ///
    /// The store must perform the following as one step, atomic with respect to
    /// any other `try_commit` that touches an overlapping key:
    ///
    /// 1. **Validate.** For every key in `writes` and every key in `reads`,
    ///    check that the key has no version with a commit timestamp greater than
    ///    `read_ts` — that is, that nothing the transaction wrote or read has
    ///    changed since its snapshot. `reads` is empty for snapshot-isolation
    ///    transactions and carries the read set for serializable ones.
    /// 2. **Apply.** If validation passes, install each write in `writes` as a
    ///    new version stamped `commit_ts` (`Some` is a value, `None` a
    ///    tombstone). The database guarantees `commit_ts` is unique and that
    ///    timestamps are handed out in increasing order.
    ///
    /// If any key fails validation, the store applies nothing and reports the
    /// conflict.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Conflict`](crate::TxnError::Conflict) if validation
    /// fails; no writes are applied. Returns
    /// [`TxnError::Store`](crate::TxnError::Store) if the backend fails to apply
    /// the batch.
    fn try_commit(
        &self,
        read_ts: Timestamp,
        commit_ts: Timestamp,
        writes: Vec<WriteEntry>,
        reads: &[Arc<[u8]>],
    ) -> Result<()>;
}
140
/// One stored version of a key: the timestamp it became visible and its value.
///
/// A `value` of `None` is a tombstone — the key was deleted at `commit_ts`.
#[derive(Debug, Clone)]
struct Version {
    /// Commit timestamp this version was installed at; readers whose snapshot
    /// is at or after this timestamp can see it.
    commit_ts: Timestamp,
    /// The bytes written, or `None` for a tombstone.
    value: Option<Arc<[u8]>>,
}
149
/// One shard's map from key to its version chain, kept in ascending
/// commit-timestamp order.
///
/// The ordering is load-bearing: `visible_value` searches a chain with
/// `partition_point`, and `newer_than` inspects only the last element as the
/// newest version.
type Chains = HashMap<Arc<[u8]>, Vec<Version>>;
153
/// One shard's slice of the keyspace.
struct Shard {
    /// Version chains of the keys that hash to this shard, behind the shard's
    /// own reader-writer lock.
    chains: RwLock<Chains>,
}
158
/// An in-memory [`VersionStore`] that shards the keyspace for concurrency.
///
/// Each key is hashed to one of a fixed number of shards; each shard holds its
/// keys' version chains behind its own reader-writer lock. Reads lock a single
/// shard; a commit locks only the shards its keys fall in. Commits to disjoint
/// shards therefore run in parallel, and the snapshot read of a key is a binary
/// search within its shard for the newest version at or below the snapshot
/// timestamp.
///
/// This is the default store of [`Db::new`](crate::Db::new) and suits caches,
/// tests, and workloads that fit in memory. Versions accumulate until garbage
/// collection lands (a later roadmap phase), so a long-lived store under heavy
/// overwrite grows without bound for now.
///
/// # Examples
///
/// ```
/// use txn_db::{Db, MemoryStore};
///
/// // `Db::new()` uses a `MemoryStore`; this is the explicit form.
/// let db = Db::with_store(MemoryStore::new());
/// let mut tx = db.begin();
/// tx.put(b"hello".to_vec(), b"world".to_vec());
/// tx.commit()?;
/// # Ok::<(), txn_db::TxnError>(())
/// ```
pub struct MemoryStore {
    /// The shard table, fixed at construction; a key's shard is found by
    /// indexing this slice with the masked key hash.
    shards: Box<[Shard]>,
    /// `shard_count - 1`; ANDed with a key hash to pick a shard.
    mask: usize,
}
190
191impl Default for MemoryStore {
192    fn default() -> Self {
193        MemoryStore::new()
194    }
195}
196
197impl MemoryStore {
198    /// Create an empty in-memory store with the default shard count.
199    ///
200    /// # Examples
201    ///
202    /// ```
203    /// use txn_db::MemoryStore;
204    ///
205    /// let store = MemoryStore::new();
206    /// # let _ = store;
207    /// ```
208    #[must_use]
209    pub fn new() -> Self {
210        MemoryStore::with_shards(DEFAULT_SHARDS)
211    }
212
213    /// Create an empty store with a specific number of shards.
214    ///
215    /// `shards` is rounded up to a power of two (and at least one). More shards
216    /// reduce contention between commits that touch unrelated keys, at the cost
217    /// of a larger fixed footprint. The default of [`MemoryStore::new`] suits
218    /// most workloads; tune this only with a benchmark in hand.
219    ///
220    /// # Examples
221    ///
222    /// ```
223    /// use txn_db::MemoryStore;
224    ///
225    /// let store = MemoryStore::with_shards(64);
226    /// # let _ = store;
227    /// ```
228    #[must_use]
229    pub fn with_shards(shards: usize) -> Self {
230        let count = shards.max(1).next_power_of_two();
231        let shards = (0..count)
232            .map(|_| Shard {
233                chains: RwLock::new(HashMap::new()),
234            })
235            .collect::<Vec<_>>()
236            .into_boxed_slice();
237        MemoryStore {
238            shards,
239            mask: count - 1,
240        }
241    }
242
243    /// Number of distinct keys that have ever been written.
244    ///
245    /// Counts keys, not versions, and includes keys whose latest version is a
246    /// tombstone. Primarily useful in tests and diagnostics.
247    ///
248    /// # Examples
249    ///
250    /// ```
251    /// use txn_db::MemoryStore;
252    ///
253    /// let store = MemoryStore::new();
254    /// assert_eq!(store.key_count(), 0);
255    /// ```
256    #[must_use]
257    pub fn key_count(&self) -> usize {
258        self.shards
259            .iter()
260            .map(|shard| sync::read(&shard.chains).len())
261            .sum()
262    }
263
264    /// The shard a key belongs to.
265    #[inline]
266    fn shard_of(&self, key: &[u8]) -> usize {
267        (hash_key(key) as usize) & self.mask
268    }
269
270    /// Install a recovered version directly, without conflict validation.
271    ///
272    /// Used only during durability recovery, replaying a committed transaction
273    /// from the log. The caller installs recovered commits in ascending
274    /// commit-timestamp order, so each version is appended to the end of its
275    /// chain and the ascending invariant is preserved.
276    #[cfg(feature = "durability")]
277    pub(crate) fn install_recovered(&self, commit_ts: Timestamp, writes: Vec<WriteEntry>) {
278        for (key, value) in writes {
279            let shard = self.shard_of(&key);
280            sync::write(&self.shards[shard].chains)
281                .entry(key)
282                .or_default()
283                .push(Version { commit_ts, value });
284        }
285    }
286}
287
288impl VersionStore for MemoryStore {
289    fn get(&self, key: &[u8], read_ts: Timestamp) -> Result<Option<Arc<[u8]>>> {
290        let shard = &self.shards[self.shard_of(key)];
291        let chains = sync::read(&shard.chains);
292        Ok(visible_value(chains.get(key), read_ts))
293    }
294
295    fn try_commit(
296        &self,
297        read_ts: Timestamp,
298        commit_ts: Timestamp,
299        writes: Vec<WriteEntry>,
300        reads: &[Arc<[u8]>],
301    ) -> Result<()> {
302        // Shard of every touched key, computed once.
303        let write_shards: Vec<usize> = writes.iter().map(|(k, _)| self.shard_of(k)).collect();
304        let read_shards: Vec<usize> = reads.iter().map(|k| self.shard_of(k)).collect();
305
306        // The distinct shards to lock, in ascending order so concurrent commits
307        // acquire shared shards in the same sequence and cannot deadlock.
308        let mut to_lock: Vec<usize> = write_shards
309            .iter()
310            .copied()
311            .chain(read_shards.iter().copied())
312            .collect();
313        to_lock.sort_unstable();
314        to_lock.dedup();
315
316        let mut guards: Vec<RwLockWriteGuard<'_, Chains>> = Vec::with_capacity(to_lock.len());
317        for &shard in &to_lock {
318            guards.push(sync::write(&self.shards[shard].chains));
319        }
320
321        // Validate the write set, then the read set: abort if any touched key
322        // gained a version after the transaction's snapshot.
323        for (entry, &shard) in writes.iter().zip(&write_shards) {
324            if let Ok(pos) = to_lock.binary_search(&shard) {
325                if newer_than(guards[pos].get(entry.0.as_ref()), read_ts) {
326                    return Err(TxnError::conflict(entry.0.len()));
327                }
328            }
329        }
330        for (key, &shard) in reads.iter().zip(&read_shards) {
331            if let Ok(pos) = to_lock.binary_search(&shard) {
332                if newer_than(guards[pos].get(key.as_ref()), read_ts) {
333                    return Err(TxnError::conflict(key.len()));
334                }
335            }
336        }
337
338        // Apply: append a new version for each write under the held locks.
339        for ((key, value), &shard) in writes.into_iter().zip(&write_shards) {
340            if let Ok(pos) = to_lock.binary_search(&shard) {
341                guards[pos]
342                    .entry(key)
343                    .or_default()
344                    .push(Version { commit_ts, value });
345            }
346        }
347        Ok(())
348    }
349}
350
351/// Whether `key`'s newest version (if any) was committed after `read_ts` — the
352/// condition that makes a commit conflict.
353#[inline]
354fn newer_than(versions: Option<&Vec<Version>>, read_ts: Timestamp) -> bool {
355    matches!(versions.and_then(|v| v.last()), Some(v) if v.commit_ts > read_ts)
356}
357
358/// The value of the newest version at or below `read_ts`, or `None` if there is
359/// none or it is a tombstone.
360#[inline]
361fn visible_value(versions: Option<&Vec<Version>>, read_ts: Timestamp) -> Option<Arc<[u8]>> {
362    let versions = versions?;
363    // Versions are ascending by commit timestamp; the newest visible one is the
364    // last entry whose timestamp is `<= read_ts`.
365    let visible = versions.partition_point(|v| v.commit_ts <= read_ts);
366    let idx = visible.checked_sub(1)?;
367    versions[idx].value.clone()
368}
369
/// 64-bit FNV-1a over the key bytes. Its only job is to choose a shard, so a
/// fast non-cryptographic spread is sufficient.
#[inline]
fn hash_key(key: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a: XOR the byte in first, then multiply by the prime.
    key.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
381
#[cfg(all(test, not(loom)))]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Build an owned, shareable byte string from a slice.
    fn k(b: &[u8]) -> Arc<[u8]> {
        b.into()
    }

    /// Commit `writes` at `ts` from a snapshot taken at `ts - 1`, with an empty
    /// read set. Panics if the store rejects the commit.
    fn commit(store: &MemoryStore, ts: u64, writes: Vec<WriteEntry>) {
        let snapshot = Timestamp::from_raw(ts - 1);
        let outcome = store.try_commit(snapshot, Timestamp::from_raw(ts), writes, &[]);
        outcome.expect("commit");
    }

    /// Read `key` as of raw timestamp `ts`, unwrapping the store result.
    fn read(store: &MemoryStore, key: &[u8], ts: u64) -> Option<Arc<[u8]>> {
        store.get(key, Timestamp::from_raw(ts)).unwrap()
    }

    #[test]
    fn test_get_on_missing_key_returns_none() {
        let store = MemoryStore::new();
        assert_eq!(read(&store, b"absent", 10), None);
    }

    #[test]
    fn test_read_sees_only_versions_at_or_before_snapshot() {
        let store = MemoryStore::new();
        commit(&store, 2, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 4, vec![(k(b"x"), Some(k(b"b")))]);

        // Before the first version nothing is visible.
        assert_eq!(read(&store, b"x", 1), None);

        // From then on each snapshot sees the newest version at or below it.
        let expectations = [
            (2u64, &b"a"[..]),
            (3, &b"a"[..]),
            (4, &b"b"[..]),
            (99, &b"b"[..]),
        ];
        for &(ts, want) in &expectations {
            assert_eq!(read(&store, b"x", ts).as_deref(), Some(want));
        }
    }

    #[test]
    fn test_tombstone_reads_as_absent() {
        let store = MemoryStore::new();
        commit(&store, 1, vec![(k(b"x"), Some(k(b"a")))]);
        commit(&store, 2, vec![(k(b"x"), None)]);

        assert_eq!(read(&store, b"x", 1).as_deref(), Some(&b"a"[..]));
        assert_eq!(read(&store, b"x", 2), None);
    }

    #[test]
    fn test_write_write_conflict_is_detected() {
        let store = MemoryStore::new();
        commit(&store, 5, vec![(k(b"x"), Some(k(b"a")))]);

        // A transaction whose snapshot predates the existing version conflicts.
        let outcome = store.try_commit(
            Timestamp::from_raw(4),
            Timestamp::from_raw(6),
            vec![(k(b"x"), Some(k(b"b")))],
            &[],
        );
        assert!(matches!(outcome.unwrap_err(), TxnError::Conflict { .. }));

        // Nothing was applied.
        assert_eq!(read(&store, b"x", 99).as_deref(), Some(&b"a"[..]));
    }

    #[test]
    fn test_read_set_validation_detects_skew() {
        let store = MemoryStore::new();
        commit(&store, 5, vec![(k(b"y"), Some(k(b"1")))]);

        // Snapshot 4, write x, but read y which changed at ts 5 -> conflict.
        let read_set = [k(b"y")];
        let outcome = store.try_commit(
            Timestamp::from_raw(4),
            Timestamp::from_raw(6),
            vec![(k(b"x"), Some(k(b"a")))],
            &read_set,
        );
        assert!(matches!(outcome.unwrap_err(), TxnError::Conflict { .. }));
    }

    #[test]
    fn test_multi_shard_commit_applies_all_keys() {
        let store = MemoryStore::with_shards(8);
        let mut batch: Vec<WriteEntry> = Vec::with_capacity(32);
        for i in 0u8..32 {
            batch.push((k(&[i]), Some(k(&[i]))));
        }
        commit(&store, 1, batch);

        for i in 0u8..32 {
            assert_eq!(read(&store, &[i], 1).as_deref(), Some(&[i][..]));
        }
        assert_eq!(store.key_count(), 32);
    }

    #[test]
    fn test_with_shards_rounds_up_to_power_of_two() {
        let store = MemoryStore::with_shards(5);
        assert_eq!(store.shards.len(), 8);
        assert_eq!(store.mask, 7);
    }
}
505}