txn-db 1.0.0

MVCC transaction engine for Rust storage layers. Snapshot isolation and serializable transactions with multi-version concurrency control, conflict detection, and a durable transaction log on wal-db. The transaction layer for embedded databases and Hive DB.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
//! The database handle and the commit coordinator behind it.
//!
//! [`Db`] is the Tier-1 entry point: construct one, [`begin`](Db::begin)
//! transactions against it, [`commit`](crate::Transaction::commit) them. A `Db`
//! is a cheap, clonable handle to shared state — clone it freely and hand a
//! clone to every thread that needs to read or write.
//!
//! The shared state itself lives in [`Inner`], which owns the version store and
//! the [`Oracle`](crate::oracle::Oracle) that allocates timestamps and tracks
//! the read watermark. Commit coordination is split deliberately: the oracle
//! hands out timestamps lock-free, and the version store is the serialization
//! point that validates and applies each commit atomically. The single global
//! commit lock of the foundation release is gone.

use std::sync::Arc;

use crate::error::Result;
use crate::oracle::Oracle;
use crate::store::{MemoryStore, VersionStore, WriteEntry};
use crate::timestamp::Timestamp;
use crate::txn::{Snapshot, Transaction};

/// Shared, reference-counted state for one logical database.
///
/// A [`Db`] is a handle to an `Arc<Inner>`; every clone of the `Db`, every
/// [`Transaction`], and every [`Snapshot`] holds a clone of the same `Inner`,
/// so they all read and commit against one version store and one timestamp
/// sequence.
pub(crate) struct Inner<S: VersionStore> {
    /// The backing version store. Reads go to it; commits validate and apply
    /// through it.
    pub(crate) store: S,
    /// Allocates timestamps and tracks the consistent-read watermark.
    oracle: Oracle,
    /// The durable commit log, present only for a database opened with
    /// [`Db::open`]. `None` for an in-memory database.
    #[cfg(feature = "durability")]
    log: Option<crate::durable::CommitLog>,
}

impl<S: VersionStore> Inner<S> {
    fn new(store: S) -> Self {
        Inner {
            store,
            oracle: Oracle::new(),
            #[cfg(feature = "durability")]
            log: None,
        }
    }

    /// The timestamp a transaction beginning now should read at.
    #[inline]
    fn read_ts(&self) -> Timestamp {
        self.oracle.read_ts()
    }

    /// Register a live reader and return its read timestamp.
    #[inline]
    pub(crate) fn begin_reader(&self) -> Timestamp {
        self.oracle.begin_reader()
    }

    /// Unregister a reader that began at `read_ts`.
    #[inline]
    pub(crate) fn end_reader(&self, read_ts: Timestamp) {
        self.oracle.end_reader(read_ts);
    }

    /// Reclaim versions no live reader can observe, returning the count removed.
    fn collect_garbage(&self) -> usize {
        self.store.collect_garbage(self.oracle.low_watermark())
    }

    /// Allocate a commit timestamp, validate-and-apply through the store, then
    /// release the timestamp into the watermark.
    ///
    /// The timestamp is reported to the oracle on both outcomes — a successful
    /// commit and a rejected one — so a conflict never stalls the read watermark
    /// behind the timestamp it consumed.
    pub(crate) fn commit_writes(
        &self,
        read_ts: Timestamp,
        writes: Vec<WriteEntry>,
        reads: &[Arc<[u8]>],
    ) -> Result<Timestamp> {
        let commit_ts = self.oracle.alloc_commit_ts();

        // Encode the durable record before the write set is consumed by the
        // store. No cost for an in-memory database (no log).
        #[cfg(feature = "durability")]
        let record = self
            .log
            .as_ref()
            .map(|_| crate::durable::encode_for_log(commit_ts, &writes));

        let outcome = self.store.try_commit(read_ts, commit_ts, writes, reads);

        // Make the commit durable before it is acknowledged. The validate-and-
        // apply has already happened in memory but is not yet visible — the
        // watermark only advances at `commit_done` below — so a crash before the
        // sync completes leaves a transaction that was never acknowledged and is
        // recovered as absent.
        #[cfg(feature = "durability")]
        if outcome.is_ok() {
            if let (Some(log), Some(record)) = (self.log.as_ref(), record) {
                if let Err(err) = log.append_committed(&record) {
                    self.oracle.commit_done(commit_ts);
                    return Err(err);
                }
            }
        }

        self.oracle.commit_done(commit_ts);
        outcome.map(|()| commit_ts)
    }

    /// Build the shared inner state for a database recovered from a durable log.
    #[cfg(feature = "durability")]
    fn recovered(store: S, oracle: Oracle, log: crate::durable::CommitLog) -> Self {
        Inner {
            store,
            oracle,
            log: Some(log),
        }
    }
}

/// A transactional, multi-version key-value database.
///
/// `Db` is the front door. [`Db::new`] gives you an in-memory database;
/// [`Db::with_store`] builds one over any [`VersionStore`]. From there the whole
/// common case is [`begin`](Db::begin) / [`get`](crate::Transaction::get) /
/// [`put`](crate::Transaction::put) / [`commit`](crate::Transaction::commit),
/// with [`snapshot`](Db::snapshot) for read-only point-in-time views.
///
/// Transactions default to **snapshot isolation**. With the `serializable`
/// feature enabled, `begin_serializable` starts a transaction whose read set is
/// validated at commit, rejecting write skew and the other anomalies snapshot
/// isolation permits.
///
/// A `Db` is a clonable handle over shared state, like an [`Arc`]. Cloning it
/// is cheap and every clone refers to the same database, so the idiomatic way
/// to use it across threads is to clone a handle per thread.
///
/// # Examples
///
/// The four-call common case:
///
/// ```
/// use txn_db::Db;
///
/// let db = Db::new();
///
/// let mut tx = db.begin();
/// tx.put(b"greeting".to_vec(), b"hei".to_vec());
/// tx.commit()?;
///
/// let tx = db.begin();
/// assert_eq!(tx.get(b"greeting")?.as_deref(), Some(&b"hei"[..]));
/// # Ok::<(), txn_db::TxnError>(())
/// ```
///
/// Sharing one database across threads:
///
/// ```
/// use std::thread;
/// use txn_db::Db;
///
/// let db = Db::new();
/// let handles: Vec<_> = (0..4u8)
///     .map(|i| {
///         let db = db.clone();
///         thread::spawn(move || {
///             let mut tx = db.begin();
///             tx.put(vec![i], vec![i]);
///             // Independent keys never conflict.
///             tx.commit().expect("commit");
///         })
///     })
///     .collect();
/// for h in handles {
///     h.join().expect("thread");
/// }
/// # Ok::<(), txn_db::TxnError>(())
/// ```
pub struct Db<S: VersionStore = MemoryStore> {
    inner: Arc<Inner<S>>,
}

impl Db<MemoryStore> {
    /// Create an empty in-memory database.
    ///
    /// This is the default configuration: a [`MemoryStore`] backing store, ready
    /// for [`begin`](Db::begin).
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// assert_eq!(db.last_committed(), txn_db::Timestamp::ZERO);
    /// ```
    #[must_use]
    pub fn new() -> Self {
        Db::with_store(MemoryStore::new())
    }

    /// Open a durable database backed by a write-ahead log at `path`, replaying
    /// any committed transactions already in the log.
    ///
    /// Every transaction committed against the returned database appends its
    /// record to the log and syncs it before [`commit`](crate::Transaction::commit)
    /// returns, so an acknowledged commit survives a crash. On open, the log is
    /// replayed: each committed transaction is reinstated, and a transaction that
    /// never reached the log — because it aborted, or because the process crashed
    /// before its record was made durable — is simply absent. The recovered data
    /// lives in memory; the log is the durable record from which it is rebuilt.
    ///
    /// Available with the `durability` feature.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Durability`](crate::TxnError::Durability) if the log
    /// cannot be opened or a record read back from it does not decode.
    ///
    /// # Examples
    ///
    /// ```
    /// # #[cfg(feature = "durability")]
    /// # {
    /// # let dir = tempfile::tempdir().expect("tempdir");
    /// # let path = dir.path().join("txn.wal");
    /// use txn_db::Db;
    ///
    /// // Commit, then drop the database.
    /// {
    ///     let db = Db::open(&path)?;
    ///     let mut tx = db.begin();
    ///     tx.put(b"k".to_vec(), b"v".to_vec());
    ///     tx.commit()?;
    /// }
    ///
    /// // Reopening replays the log: the committed write is still there.
    /// let db = Db::open(&path)?;
    /// assert_eq!(db.begin().get(b"k")?.as_deref(), Some(&b"v"[..]));
    /// # }
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    #[cfg(feature = "durability")]
    #[cfg_attr(docsrs, doc(cfg(feature = "durability")))]
    pub fn open(path: impl AsRef<std::path::Path>) -> Result<Db<MemoryStore>> {
        let (log, mut recovered) = crate::durable::CommitLog::open(path)?;

        // Replay in ascending commit-timestamp order; records may sit in the log
        // out of that order because commits append after applying, concurrently.
        recovered.sort_by_key(|commit| commit.commit_ts);

        let store = MemoryStore::new();
        let mut highest = Timestamp::ZERO;
        for commit in recovered {
            highest = highest.max(commit.commit_ts);
            store.install_recovered(commit.commit_ts, commit.writes);
        }

        Ok(Db {
            inner: Arc::new(Inner::recovered(store, Oracle::recovered(highest), log)),
        })
    }
}

impl Default for Db<MemoryStore> {
    fn default() -> Self {
        Db::new()
    }
}

impl<S: VersionStore> Db<S> {
    /// Create a database over a custom [`VersionStore`].
    ///
    /// This is the Tier-3 seam: supply any backing store and the transaction
    /// semantics — snapshot isolation, read-your-own-writes, conflict detection
    /// — compose on top of it unchanged.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::{Db, MemoryStore};
    ///
    /// let db = Db::with_store(MemoryStore::new());
    /// let mut tx = db.begin();
    /// tx.put(b"k".to_vec(), b"v".to_vec());
    /// tx.commit()?;
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    #[must_use]
    pub fn with_store(store: S) -> Self {
        Db {
            inner: Arc::new(Inner::new(store)),
        }
    }

    /// Begin a snapshot-isolation transaction over the current state.
    ///
    /// The transaction takes its snapshot at this moment: it reads as of the
    /// most recent commit and is unaffected by commits that happen afterward.
    /// Its writes are checked for write-write conflicts at commit, but its reads
    /// are not validated — use `begin_serializable` (with the `serializable`
    /// feature) when you need serializability.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// let mut tx = db.begin();
    /// tx.put(b"k".to_vec(), b"v".to_vec());
    /// tx.commit()?;
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    pub fn begin(&self) -> Transaction<S> {
        Transaction::new(Arc::clone(&self.inner), false)
    }

    /// Begin a serializable transaction over the current state.
    ///
    /// A serializable transaction tracks every key it reads and, at commit,
    /// validates that none of them changed since its snapshot — in addition to
    /// the write-write check every transaction gets. That read-set validation is
    /// what rejects write skew and the read-only anomaly that plain snapshot
    /// isolation permits, giving serializable behavior for the transactions that
    /// commit writes. A serializable transaction that writes nothing commits
    /// trivially, exactly like a read-only snapshot.
    ///
    /// Available with the `serializable` feature. Snapshot isolation remains the
    /// default and is unaffected.
    ///
    /// # Examples
    ///
    /// ```
    /// # #[cfg(feature = "serializable")]
    /// # {
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// // Seed two rows that an invariant ties together.
    /// let mut tx = db.begin();
    /// tx.put(b"on_call:alice".to_vec(), vec![1]);
    /// tx.put(b"on_call:bob".to_vec(), vec![1]);
    /// tx.commit()?;
    ///
    /// // A serializable transaction validates the rows it read at commit.
    /// let mut tx = db.begin_serializable();
    /// let _alice = tx.get(b"on_call:alice")?;
    /// let _bob = tx.get(b"on_call:bob")?;
    /// tx.put(b"on_call:alice".to_vec(), vec![0]);
    /// tx.commit()?;
    /// # }
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    #[cfg(feature = "serializable")]
    #[cfg_attr(docsrs, doc(cfg(feature = "serializable")))]
    pub fn begin_serializable(&self) -> Transaction<S> {
        Transaction::new(Arc::clone(&self.inner), true)
    }

    /// Take a read-only snapshot of the current state of the database.
    ///
    /// The returned [`Snapshot`] reads as of this instant and never changes,
    /// even as other transactions commit. Use it to read several keys at one
    /// consistent point in time without the overhead of a transaction.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// let snap = db.snapshot();
    /// assert_eq!(snap.get(b"k")?, None);
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    pub fn snapshot(&self) -> Snapshot<S> {
        Snapshot::new(Arc::clone(&self.inner))
    }

    /// Read one key without opening a transaction.
    ///
    /// A convenience for the common single-read case: it takes a snapshot of the
    /// current state and reads `key` from it, returning the newest committed
    /// value or `None` if the key is absent. For reading several keys at one
    /// consistent instant, take a [`snapshot`](Db::snapshot) and reuse it.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Store`](crate::TxnError::Store) if the backing store
    /// fails the read. The default in-memory store never fails.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// db.put(b"k".to_vec(), b"v".to_vec())?;
    /// assert_eq!(db.get(b"k")?.as_deref(), Some(&b"v"[..]));
    /// assert_eq!(db.get(b"absent")?, None);
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    pub fn get(&self, key: &[u8]) -> Result<Option<Arc<[u8]>>> {
        self.snapshot().get(key)
    }

    /// Write one key in its own transaction, retrying on conflict, and return the
    /// commit timestamp.
    ///
    /// A convenience for the common single-write case: it begins a transaction,
    /// buffers the write, and commits. If a concurrent transaction wins the
    /// commit race it retries against a fresher snapshot, so this is
    /// last-writer-wins and never surfaces a conflict — the value is always
    /// installed. When you need to read-then-write atomically, or to control the
    /// conflict outcome yourself, use [`begin`](Db::begin) instead.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Store`](crate::TxnError::Store) if the backing store
    /// fails to apply the write, or
    /// [`TxnError::Durability`](crate::TxnError::Durability) for a durable
    /// database whose commit cannot be made durable. Conflicts are retried, not
    /// returned.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// let ts = db.put(b"k".to_vec(), b"v".to_vec())?;
    /// assert!(ts > txn_db::Timestamp::ZERO);
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    pub fn put(&self, key: impl Into<Arc<[u8]>>, value: impl Into<Arc<[u8]>>) -> Result<Timestamp> {
        let key = key.into();
        let value = value.into();
        loop {
            let mut tx = self.begin();
            tx.put(Arc::clone(&key), Arc::clone(&value));
            match tx.commit() {
                Ok(ts) => return Ok(ts),
                Err(e) if e.is_retryable() => continue,
                Err(e) => return Err(e),
            }
        }
    }

    /// Delete one key in its own transaction, retrying on conflict, and return
    /// the commit timestamp.
    ///
    /// The delete counterpart of [`put`](Db::put): last-writer-wins, conflicts
    /// retried. After it returns the key reads as absent until written again.
    ///
    /// # Errors
    ///
    /// Returns [`TxnError::Store`](crate::TxnError::Store) if the backing store
    /// fails, or [`TxnError::Durability`](crate::TxnError::Durability) for a
    /// durable database whose commit cannot be made durable. Conflicts are
    /// retried, not returned.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// db.put(b"k".to_vec(), b"v".to_vec())?;
    /// db.delete(b"k".to_vec())?;
    /// assert_eq!(db.get(b"k")?, None);
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    pub fn delete(&self, key: impl Into<Arc<[u8]>>) -> Result<Timestamp> {
        let key = key.into();
        loop {
            let mut tx = self.begin();
            tx.delete(Arc::clone(&key));
            match tx.commit() {
                Ok(ts) => return Ok(ts),
                Err(e) if e.is_retryable() => continue,
                Err(e) => return Err(e),
            }
        }
    }

    /// The timestamp of the most recent commit visible to a new transaction.
    ///
    /// Returns [`Timestamp::ZERO`] for a database that has never been written.
    /// This is the read watermark: the timestamp a transaction beginning now
    /// would read at.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// assert_eq!(db.last_committed(), txn_db::Timestamp::ZERO);
    ///
    /// let mut tx = db.begin();
    /// tx.put(b"k".to_vec(), b"v".to_vec());
    /// let ts = tx.commit()?;
    /// assert_eq!(db.last_committed(), ts);
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    #[must_use]
    pub fn last_committed(&self) -> Timestamp {
        self.inner.read_ts()
    }

    /// Reclaim versions that no live transaction or snapshot can observe,
    /// returning how many were removed.
    ///
    /// `txn-db` keeps every version of a key so that an in-flight reader sees a
    /// stable snapshot. Once no live reader can observe an old version — because
    /// every active transaction and snapshot reads at a timestamp newer than a
    /// later version of that key — the old one is unreachable and this reclaims
    /// it. A key deleted before the oldest live reader's snapshot is dropped
    /// entirely.
    ///
    /// Call it periodically, or after retiring long-running snapshots, to bound
    /// memory. It is safe to call at any time and from any thread: a version a
    /// live reader can still reach is never reclaimed. With the default
    /// in-memory store this prunes the version chains; a custom
    /// [`VersionStore`](crate::VersionStore) that keeps no history can leave the
    /// default no-op in place.
    ///
    /// # Examples
    ///
    /// ```
    /// use txn_db::Db;
    ///
    /// let db = Db::new();
    /// // Overwrite the same key several times.
    /// for v in 0..5u8 {
    ///     let mut tx = db.begin();
    ///     tx.put(b"k".to_vec(), vec![v]);
    ///     tx.commit()?;
    /// }
    ///
    /// // No snapshot is held, so only the newest version need be kept.
    /// let reclaimed = db.collect_garbage();
    /// assert!(reclaimed > 0);
    /// assert_eq!(db.begin().get(b"k")?.as_deref(), Some(&[4u8][..]));
    /// # Ok::<(), txn_db::TxnError>(())
    /// ```
    pub fn collect_garbage(&self) -> usize {
        self.inner.collect_garbage()
    }
}

impl<S: VersionStore> Clone for Db<S> {
    /// Clone the handle, not the data: the clone shares the same underlying
    /// database.
    fn clone(&self) -> Self {
        Db {
            inner: Arc::clone(&self.inner),
        }
    }
}

#[cfg(all(test, not(loom)))]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    #[test]
    fn test_new_database_is_empty_at_zero() {
        let db = Db::new();
        assert_eq!(db.last_committed(), Timestamp::ZERO);
        assert_eq!(db.begin().get(b"k").unwrap(), None);
    }

    #[test]
    fn test_commit_makes_writes_visible_to_later_transactions() {
        let db = Db::new();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), b"v".to_vec());
        let ts = tx.commit().unwrap();
        assert!(ts > Timestamp::ZERO);
        assert_eq!(db.begin().get(b"k").unwrap().as_deref(), Some(&b"v"[..]));
    }

    #[test]
    fn test_snapshot_is_isolated_from_later_commits() {
        let db = Db::new();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), b"v1".to_vec());
        let _ = tx.commit().unwrap();

        let snap = db.snapshot();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), b"v2".to_vec());
        let _ = tx.commit().unwrap();

        assert_eq!(snap.get(b"k").unwrap().as_deref(), Some(&b"v1"[..]));
    }

    #[test]
    fn test_write_write_conflict_aborts_later_committer() {
        let db = Db::new();
        let mut a = db.begin();
        let mut b = db.begin();
        a.put(b"k".to_vec(), b"a".to_vec());
        b.put(b"k".to_vec(), b"b".to_vec());

        assert!(a.commit().is_ok());
        let err = b.commit().expect_err("second committer must lose");
        assert!(err.is_retryable());
        assert_eq!(db.begin().get(b"k").unwrap().as_deref(), Some(&b"a"[..]));
    }

    #[test]
    fn test_disjoint_keys_do_not_conflict() {
        let db = Db::new();
        let mut a = db.begin();
        let mut b = db.begin();
        a.put(b"a".to_vec(), b"1".to_vec());
        b.put(b"b".to_vec(), b"2".to_vec());
        assert!(a.commit().is_ok());
        assert!(b.commit().is_ok());
    }

    #[test]
    fn test_read_only_commit_returns_snapshot_timestamp() {
        let db = Db::new();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), b"v".to_vec());
        let ts = tx.commit().unwrap();

        let ro = db.begin();
        assert_eq!(ro.commit().unwrap(), ts);
    }

    #[test]
    fn test_rollback_discards_writes() {
        let db = Db::new();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), b"v".to_vec());
        tx.rollback();
        assert_eq!(db.begin().get(b"k").unwrap(), None);
    }

    #[test]
    fn test_autocommit_put_get_delete() {
        let db = Db::new();
        let ts = db.put(b"k".to_vec(), b"v".to_vec()).unwrap();
        assert!(ts > Timestamp::ZERO);
        assert_eq!(db.get(b"k").unwrap().as_deref(), Some(&b"v"[..]));
        assert_eq!(db.get(b"absent").unwrap(), None);

        let ts2 = db.delete(b"k".to_vec()).unwrap();
        assert!(ts2 > ts);
        assert_eq!(db.get(b"k").unwrap(), None);
    }

    #[test]
    fn test_autocommit_put_is_last_writer_wins_under_contention() {
        use std::thread;
        let db = Db::new();
        let handles: Vec<_> = (0..8u8)
            .map(|t| {
                let db = db.clone();
                thread::spawn(move || {
                    for _ in 0..100 {
                        // All threads write the same hot key; autocommit retries
                        // internally, so none of these ever fail.
                        let _ = db.put(b"hot".to_vec(), vec![t]).unwrap();
                    }
                })
            })
            .collect();
        for h in handles {
            h.join().unwrap();
        }
        // The key exists and holds one of the written values.
        let v = db.get(b"hot").unwrap().unwrap();
        assert!(v.len() == 1 && v[0] < 8);
    }

    #[test]
    fn test_gc_reclaims_when_no_reader_is_held() {
        let db = Db::new();
        for v in 0..5u8 {
            let mut tx = db.begin();
            tx.put(b"k".to_vec(), vec![v]);
            let _ = tx.commit().unwrap();
        }
        let reclaimed = db.collect_garbage();
        assert!(reclaimed > 0);
        assert_eq!(db.begin().get(b"k").unwrap().as_deref(), Some(&[4u8][..]));
    }

    #[test]
    fn test_held_snapshot_pins_gc() {
        let db = Db::new();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), vec![1]);
        let _ = tx.commit().unwrap();

        // Hold a snapshot of this state, then overwrite the key.
        let snap = db.snapshot();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), vec![2]);
        let _ = tx.commit().unwrap();

        // GC must not reclaim the version the held snapshot still observes.
        let _ = db.collect_garbage();
        assert_eq!(snap.get(b"k").unwrap().as_deref(), Some(&[1u8][..]));

        // Once the snapshot is dropped, the old version becomes reclaimable.
        drop(snap);
        let reclaimed = db.collect_garbage();
        assert!(reclaimed > 0);
        assert_eq!(db.begin().get(b"k").unwrap().as_deref(), Some(&[2u8][..]));
    }

    #[test]
    fn test_clone_shares_state() {
        let db = Db::new();
        let db2 = db.clone();
        let mut tx = db.begin();
        tx.put(b"k".to_vec(), b"v".to_vec());
        let _ = tx.commit().unwrap();
        assert_eq!(db2.begin().get(b"k").unwrap().as_deref(), Some(&b"v"[..]));
    }

    #[cfg(feature = "serializable")]
    #[test]
    fn test_serializable_rejects_write_skew() {
        let db = Db::new();
        let mut seed = db.begin();
        seed.put(b"x".to_vec(), vec![1]);
        seed.put(b"y".to_vec(), vec![1]);
        let _ = seed.commit().unwrap();

        // Two serializable transactions from the same snapshot each read both
        // rows and write the one the other read.
        let mut t1 = db.begin_serializable();
        let mut t2 = db.begin_serializable();
        let _ = t1.get(b"x").unwrap();
        let _ = t1.get(b"y").unwrap();
        let _ = t2.get(b"x").unwrap();
        let _ = t2.get(b"y").unwrap();
        t1.put(b"x".to_vec(), vec![0]);
        t2.put(b"y".to_vec(), vec![0]);

        assert!(t1.commit().is_ok());
        // t2 read x, which t1 changed -> serializable validation aborts it.
        let err = t2.commit().expect_err("write skew must be rejected");
        assert!(err.is_retryable());
    }

    #[cfg(feature = "serializable")]
    #[test]
    fn test_snapshot_txn_allows_write_skew() {
        let db = Db::new();
        let mut seed = db.begin();
        seed.put(b"x".to_vec(), vec![1]);
        seed.put(b"y".to_vec(), vec![1]);
        let _ = seed.commit().unwrap();

        // The same schedule under plain snapshot isolation: both commit, because
        // SI does not validate the read set.
        let mut t1 = db.begin();
        let mut t2 = db.begin();
        let _ = t1.get(b"x").unwrap();
        let _ = t1.get(b"y").unwrap();
        let _ = t2.get(b"x").unwrap();
        let _ = t2.get(b"y").unwrap();
        t1.put(b"x".to_vec(), vec![0]);
        t2.put(b"y".to_vec(), vec![0]);

        assert!(t1.commit().is_ok());
        assert!(t2.commit().is_ok());
    }
}