// slatedb 0.9.2 - Docs.rs

use bytes::Bytes;
use parking_lot::{Mutex, RwLock};
use std::collections::HashSet;
use std::ops::RangeBounds;
use std::sync::Arc;
use uuid::Uuid;

use crate::batch::WriteBatch;
use crate::bytes_range::BytesRange;
use crate::config::{MergeOptions, PutOptions, ReadOptions, ScanOptions, WriteOptions};
use crate::db::DbInner;
use crate::db_iter::{DbIterator, DbIteratorRangeTracker};
use crate::transaction_manager::{IsolationLevel, TransactionManager};
use crate::DbRead;

/// A database transaction that provides atomic read-write operations with
/// configurable isolation levels. This is the main interface for transactional
/// operations in SlateDB.
///
/// Writes are buffered in an in-memory write batch and are only submitted to the
/// database on commit. Dropping the transaction without committing unregisters it
/// from the transaction manager, which is treated as a rollback.
///
/// # Examples
///
/// Basic transaction usage:
/// ```rust
/// # async fn run() -> Result<(), slatedb::Error> {
/// #     use std::sync::Arc;
/// #     use slatedb::object_store::memory::InMemory;
/// use slatedb::{Db, IsolationLevel};
///
/// #     let object_store = Arc::new(InMemory::new());
/// #     let db = Db::open("path/to/db", object_store).await?;
/// let txn = db.begin(IsolationLevel::Snapshot).await?;
///
/// // Read operations
/// let value = txn.get(b"key").await?;
///
/// // Write operations
/// txn.put(b"key", b"value")?;
/// txn.delete(b"key")?;
///
/// // Commit the transaction
/// txn.commit().await?;
/// # Ok(())
/// # }
/// ```
pub struct DBTransaction {
    /// Transaction ID generated by the transaction manager
    txn_id: Uuid,
    /// Sequence number when the transaction started. It is passed to the reader on
    /// every `get`/`scan` issued through the transaction.
    started_seq: u64,
    /// Reference to the transaction manager
    txn_manager: Arc<TransactionManager>,
    /// The write batch of the transaction, which contains the uncommitted writes.
    /// Users can read data from the write batch during the transaction, thus providing
    /// an MVCC view of the database.
    ///
    /// DBTransaction is not intended for concurrent use; we use `RwLock` (not `RefCell`) for
    /// interior mutability to preserve `Sync` in async contexts. `RefCell` is `!Sync` and would
    /// make `DBTransaction` `!Sync`, which is incompatible with async code using the `DbRead`
    /// trait.
    write_batch: RwLock<WriteBatch>,
    /// Reference to the database
    db_inner: Arc<DbInner>,
    /// Isolation level for this transaction
    isolation_level: IsolationLevel,
    /// Range trackers for scanned ranges (used for SSI conflict detection).
    /// One tracker is added per scan under `IsolationLevel::SerializableSnapshot`;
    /// the tracked ranges are reported to the transaction manager at commit time.
    range_trackers: Mutex<Vec<Arc<DbIteratorRangeTracker>>>,
}

impl DBTransaction {
    /// Creates a read-write transaction that starts at sequence number `seq`.
    ///
    /// The transaction is registered with `txn_manager`, and the ID handed back by the
    /// manager is stamped onto the transaction's fresh write batch.
    #[allow(unused)]
    pub(crate) fn new(
        db_inner: Arc<DbInner>,
        txn_manager: Arc<TransactionManager>,
        seq: u64,
        isolation_level: IsolationLevel,
    ) -> Self {
        // This constructor only ever builds read-write transactions.
        let read_only = false;
        let txn_id = txn_manager.new_txn(seq, read_only);
        let pending_writes = WriteBatch::new().with_txn_id(txn_id);

        Self {
            txn_id,
            started_seq: seq,
            txn_manager,
            write_batch: RwLock::new(pending_writes),
            db_inner,
            isolation_level,
            range_trackers: Mutex::new(Vec::new()),
        }
    }

    /// Reads `key` through the transaction using the default `ReadOptions`.
    ///
    /// This is a convenience wrapper around [`DBTransaction::get_with_options`]; the read
    /// is tracked for conflict detection in SSI mode.
    ///
    /// ## Arguments
    /// - `key`: the key to get
    ///
    /// ## Returns
    /// - `Result<Option<Bytes>, crate::Error>`: the value if it exists, None otherwise
    pub async fn get<K: AsRef<[u8]> + Send>(&self, key: K) -> Result<Option<Bytes>, crate::Error> {
        let default_options = ReadOptions::default();
        self.get_with_options(key, &default_options).await
    }

    /// Reads `key` through the transaction with caller-supplied read options.
    ///
    /// Under `IsolationLevel::SerializableSnapshot` the key is reported to the transaction
    /// manager as part of this transaction's read set before the lookup happens. The lookup
    /// is given a copy of the transaction's buffered writes and the sequence number at which
    /// the transaction started.
    ///
    /// ## Arguments
    /// - `key`: the key to get
    /// - `options`: the read options to use
    ///
    /// ## Returns
    /// - `Result<Option<Bytes>, crate::Error>`: the value if it exists, None otherwise
    ///
    /// ## Errors
    /// - Returns an error if the closed-database check fails or the underlying read fails.
    pub async fn get_with_options<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
        options: &ReadOptions,
    ) -> Result<Option<Bytes>, crate::Error> {
        self.db_inner.check_closed()?;

        // Only SSI needs the read set; plain snapshot isolation skips the bookkeeping.
        if self.isolation_level == IsolationLevel::SerializableSnapshot {
            let tracked_keys = HashSet::from([Bytes::copy_from_slice(key.as_ref())]);
            self.txn_manager
                .track_read_keys(&self.txn_id, &tracked_keys);
        }

        let state_view = self.db_inner.state.read().view();

        // The reader works on its own copy of the buffered writes.
        let pending_writes = self.write_batch.read().clone();

        let found = self
            .db_inner
            .reader
            .get_with_options(
                key,
                options,
                &state_view,
                Some(pending_writes),
                Some(self.started_seq),
            )
            .await;
        found.map_err(Into::into)
    }

    /// Scans `range` through the transaction using the default `ScanOptions`.
    ///
    /// This is a convenience wrapper around [`DBTransaction::scan_with_options`]; the scanned
    /// range is tracked for conflict detection in SSI mode.
    ///
    /// ## Arguments
    /// - `range`: the range of keys to scan
    ///
    /// ## Returns
    /// - `Result<DbIterator, crate::Error>`: An iterator with the results of the scan
    pub async fn scan<K, T>(&self, range: T) -> Result<DbIterator, crate::Error>
    where
        K: AsRef<[u8]> + Send,
        T: RangeBounds<K> + Send,
    {
        let default_options = ScanOptions::default();
        self.scan_with_options(range, &default_options).await
    }

    /// Scan a range of keys with the provided options.
    /// This operation will track the read range for conflict detection in SSI mode.
    ///
    /// The scan is given a copy of the transaction's buffered writes and the sequence
    /// number at which the transaction started.
    ///
    /// ## Arguments
    /// - `range`: the range of keys to scan
    /// - `options`: the scan options to use
    ///
    /// ## Returns
    /// - `Result<DbIterator, crate::Error>`: An iterator with the results of the scan
    ///
    /// ## Errors
    /// - Returns an error if the closed-database check fails or the underlying scan
    ///   cannot be started.
    pub async fn scan_with_options<K, T>(
        &self,
        range: T,
        options: &ScanOptions,
    ) -> Result<DbIterator, crate::Error>
    where
        K: AsRef<[u8]> + Send,
        T: RangeBounds<K> + Send,
    {
        // Fail fast, mirroring `get_with_options`: bail out before any range tracker is
        // registered, so a scan that never starts leaves no bookkeeping behind.
        self.db_inner.check_closed()?;

        // TODO: this range conversion logic can be extracted into a utility
        let start = range
            .start_bound()
            .map(|b| Bytes::copy_from_slice(b.as_ref()));
        let end = range
            .end_bound()
            .map(|b| Bytes::copy_from_slice(b.as_ref()));
        let range = (start, end);

        // Track read range for SSI conflict detection if needed. The tracker is shared
        // with the iterator; its range is reported to the transaction manager on commit.
        let range_tracker = if self.isolation_level == IsolationLevel::SerializableSnapshot {
            let tracker = Arc::new(DbIteratorRangeTracker::new());
            self.range_trackers.lock().push(tracker.clone());
            Some(tracker)
        } else {
            None
        };

        let db_state = self.db_inner.state.read().view();

        // Clone the WriteBatch for the scan to ensure that the scan within a transaction
        // sees a consistent view of the current writes.
        let write_batch_cloned = self.write_batch.read().clone();

        // For now, delegate to the underlying reader
        self.db_inner
            .reader
            .scan_with_options(
                BytesRange::from(range),
                options,
                &db_state,
                Some(write_batch_cloned),
                Some(self.started_seq),
                range_tracker,
            )
            .await
            .map_err(Into::into)
    }

    /// Buffers a put of `key` -> `value` using the default `PutOptions`.
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to write
    /// - `value`: the value to write
    ///
    /// ## Errors
    /// - This implementation always returns `Ok(())`: the write is only recorded in the
    ///   transaction's write batch.
    pub fn put<K, V>(&self, key: K, value: V) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let default_options = PutOptions::default();
        self.put_with_options(key, value, &default_options)
    }

    /// Buffers a put of `key` -> `value` with caller-supplied put options.
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to write
    /// - `value`: the value to write
    /// - `options`: the put options to use
    ///
    /// ## Errors
    /// - This implementation always returns `Ok(())`: the write is only recorded in the
    ///   transaction's write batch.
    pub fn put_with_options<K, V>(
        &self,
        key: K,
        value: V,
        options: &PutOptions,
    ) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let mut pending_writes = self.write_batch.write();
        pending_writes.put_with_options(key, value, options);
        Ok(())
    }

    /// Buffers a merge of `value` into `key` using the default `MergeOptions`.
    /// The merge is recorded in the transaction's write batch until commit.
    ///
    /// ## Arguments
    /// - `key`: the key to merge into
    /// - `value`: the merge operand
    ///
    /// ## Errors
    /// - This implementation always returns `Ok(())`.
    pub fn merge<K, V>(&self, key: K, value: V) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let default_options = MergeOptions::default();
        self.merge_with_options(key, value, &default_options)
    }

    /// Buffers a merge of `value` into `key` with caller-supplied merge options.
    /// The merge is recorded in the transaction's write batch until commit.
    ///
    /// ## Arguments
    /// - `key`: the key to merge into
    /// - `value`: the merge operand
    /// - `options`: the merge options to use
    ///
    /// ## Errors
    /// - This implementation always returns `Ok(())`.
    pub fn merge_with_options<K, V>(
        &self,
        key: K,
        value: V,
        options: &MergeOptions,
    ) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let mut pending_writes = self.write_batch.write();
        pending_writes.merge_with_options(key, value, options);
        Ok(())
    }

    /// Buffers a delete of `key`.
    /// Nothing is removed from the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to delete
    ///
    /// ## Errors
    /// - This implementation always returns `Ok(())`: the delete is only recorded in the
    ///   transaction's write batch.
    pub fn delete<K: AsRef<[u8]>>(&self, key: K) -> Result<(), crate::Error> {
        let mut pending_writes = self.write_batch.write();
        pending_writes.delete(key);
        Ok(())
    }

    /// Commit the transaction by applying all buffered operations to the database,
    /// using the default `WriteOptions`.
    ///
    /// This method finalizes the transaction by submitting all pending puts, deletes, and
    /// other operations from the write batch to the database. The actual conflict detection
    /// (including read-write and write-write conflicts) is deferred to the task that processes
    /// the WriteBatch, which ensures the atomicity of transactions.
    ///
    /// If the transaction's write batch is empty, this operation is a no-op and returns
    /// `Ok(())` immediately without any database interaction, because an empty write batch
    /// can cause neither a read-write nor a write-write conflict.
    ///
    /// ## Errors
    /// - Returns `Error` if the commit operation fails, which could be due to:
    ///   - Database I/O errors
    ///   - Concurrency conflicts detected during WriteBatch processing
    pub async fn commit(self) -> Result<(), crate::Error> {
        let default_options = WriteOptions::default();
        self.commit_with_options(&default_options).await
    }

    /// Commit the transaction with custom write options.
    ///
    /// This method behaves the same as [`DBTransaction::commit`], but allows callers
    /// to specify custom [`WriteOptions`], such as `await_durable`.
    ///
    /// ## Arguments
    /// - `options`: the write options to use for the commit
    ///
    /// ## Errors
    /// - Returns `Error` if the commit operation fails, which could be due to:
    ///   - Database I/O errors
    ///   - Concurrency conflicts detected during WriteBatch processing
    pub async fn commit_with_options(self, options: &WriteOptions) -> Result<(), crate::Error> {
        // If the WriteBatch is empty, it's a no-op. it's impossible to have read-write
        // conflict, neither write-write conflict.
        if self.write_batch.read().is_empty() {
            return Ok(());
        }

        // Extract actual scanned ranges from trackers for SSI conflict detection
        if self.isolation_level == IsolationLevel::SerializableSnapshot {
            for tracker in self.range_trackers.lock().iter() {
                if tracker.has_data() {
                    if let Some(range) = tracker.get_range() {
                        self.txn_manager.track_read_range(&self.txn_id, range);
                    }
                }
            }
        }

        // Take the write_batch for submission to the database.
        let write_batch = self.write_batch.read().clone();

        // Track the write keys from write batch
        let write_keys = write_batch.keys();
        self.txn_manager.track_write_keys(&self.txn_id, &write_keys);

        // Submit the WriteBatch to the database for processing. The batch is sent to a
        // dedicated background task (in batch_write.rs) that processes all WriteBatches
        // sequentially, ensuring no concurrent writes. Both conflict checking & persisting
        // are handled there.
        self.db_inner
            .write_with_options(write_batch, options)
            .await
            .map_err(Into::into)
    }

    /// Rolls the transaction back, discarding every buffered operation.
    ///
    /// Calling this is equivalent to letting the transaction go out of scope: all the
    /// work happens in the `Drop` implementation.
    pub fn rollback(self) {
        // Dropping `self` is the whole rollback.
        drop(self);
    }
}

#[async_trait::async_trait]
impl DbRead for DBTransaction {
    async fn get_with_options<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
        options: &ReadOptions,
    ) -> Result<Option<Bytes>, crate::Error> {
        self.get_with_options(key, options).await
    }

    async fn scan_with_options<K, T>(
        &self,
        range: T,
        options: &ScanOptions,
    ) -> Result<DbIterator, crate::Error>
    where
        K: AsRef<[u8]> + Send,
        T: RangeBounds<K> + Send,
    {
        self.scan_with_options(range, options).await
    }
}

/// Unregisters the transaction from the transaction manager when it is dropped.
/// A transaction that is dropped without having been committed counts as rolled back.
impl Drop for DBTransaction {
    fn drop(&mut self) {
        let txn_id = &self.txn_id;
        self.txn_manager.drop_txn(txn_id);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::object_store::memory::InMemory;
    use rstest::rstest;
    use std::sync::Arc;

    /// A snapshot transaction sees data committed before it began and does not see
    /// keys written by others after it began.
    #[tokio::test]
    async fn test_txn_basic_visibility() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();

        // Begin transaction
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();

        // Put data from others
        db.put(b"k2", b"v2").await.unwrap();

        // Read within transaction - should see the initial data
        let value = txn.get(b"k1").await.unwrap();
        assert_eq!(value, Some(Bytes::from_static(b"v1")));

        // The key written after the transaction began must stay invisible to it
        let concurrent_value = txn.get(b"k2").await.unwrap();
        assert_eq!(concurrent_value, None);

        // Commit transaction
        txn.commit().await.unwrap();
    }

    /// A transaction reads back its own uncommitted write.
    #[tokio::test]
    async fn test_txn_write_visibility_in_txn() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", store).await.unwrap();

        // Seed the key outside of any transaction.
        db.put(b"k1", b"v1").await.unwrap();

        let isolation = IsolationLevel::SerializableSnapshot;
        let txn = db.begin(isolation).await.unwrap();

        // Overwrite the key inside the transaction...
        txn.put(b"k1", b"v2").unwrap();

        // ...and expect the buffered value, not the committed one.
        let seen = txn.get(b"k1").await.unwrap();
        let expected = Some(Bytes::from_static(b"v2"));
        assert_eq!(seen, expected);

        txn.commit().await.unwrap();
    }

    /// Two snapshot transactions write the same key: the first commit wins and the
    /// second commit is rejected.
    #[tokio::test]
    async fn test_txn_si_commit_conflict() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", store).await.unwrap();

        // Seed the contended key.
        db.put(b"k1", b"v1").await.unwrap();

        // Both transactions buffer a write to the same key.
        let first = db.begin(IsolationLevel::Snapshot).await.unwrap();
        first.put(b"k1", b"v2").unwrap();

        let second = db.begin(IsolationLevel::Snapshot).await.unwrap();
        second.put(b"k1", b"v3").unwrap();

        // First committer succeeds.
        first.commit().await.unwrap();

        // Second committer must fail because of the write-write conflict.
        assert!(second.commit().await.is_err());
    }

    /// A snapshot transaction conflicts with a non-transactional write to the same key.
    #[tokio::test]
    async fn test_txn_si_commit_conflict_with_db_writes() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", store).await.unwrap();

        // Seed the contended key.
        db.put(b"k1", b"v1").await.unwrap();

        // The transaction buffers a write to the key...
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn.put(b"k1", b"v2").unwrap();

        // ...while a direct database write lands on the same key.
        db.put(b"k1", b"v3").await.unwrap();

        // The commit must be rejected.
        assert!(txn.commit().await.is_err());
    }

    /// Under SSI, a transaction that read a key which another transaction then
    /// overwrote and committed fails to commit, even though its own write targets a
    /// different key.
    #[tokio::test]
    async fn test_txn_ssi_commit_conflict() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", store).await.unwrap();

        // Seed two keys.
        db.put(b"k1", b"v1").await.unwrap();
        db.put(b"k2", b"v2.1").await.unwrap();

        let isolation = IsolationLevel::SerializableSnapshot;

        // Writer: overwrites both keys.
        let writer = db.begin(isolation).await.unwrap();
        writer.put(b"k1", b"v2").unwrap();
        writer.put(b"k2", b"v2.2").unwrap();

        // Reader: reads k2 (still the seeded value) and writes an unrelated key.
        let reader = db.begin(isolation).await.unwrap();
        let seen = reader.get(b"k2").await.unwrap();
        assert_eq!(seen, Some(Bytes::from_static(b"v2.1")));
        reader.put(b"k3", b"v3").unwrap();

        // The writer commits first and succeeds.
        writer.commit().await.unwrap();

        // The reader's commit must fail: the k2 it read has since been overwritten.
        assert!(reader.commit().await.is_err());
    }

    /// Under SSI, a transaction that scanned a range which another transaction then
    /// wrote into and committed fails to commit.
    #[tokio::test]
    async fn test_txn_ssi_commit_conflit_with_ranges() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", store).await.unwrap();

        // Seed three keys.
        db.put(b"k1", b"v1").await.unwrap();
        db.put(b"k2", b"v2.1").await.unwrap();
        db.put(b"k3", b"v3").await.unwrap();

        let isolation = IsolationLevel::SerializableSnapshot;
        let writer = db.begin(isolation).await.unwrap();
        let scanner = db.begin(isolation).await.unwrap();

        // The scanner walks k2..=k3 to the end so the whole range gets tracked.
        let mut iter = scanner.scan(&b"k2"[..]..=&b"k3"[..]).await.unwrap();
        while iter.next().await.unwrap().is_some() {}
        drop(iter);

        // The writer modifies a key inside the scanned range and commits.
        writer.put(b"k2", b"v2.2").unwrap();
        writer.commit().await.unwrap();

        // The scanner writes a key outside the range it scanned.
        scanner.put(b"k4", b"v4").unwrap();

        // Its commit must still fail: the range it read was modified in the meantime.
        assert!(scanner.commit().await.is_err());
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
    async fn test_txn_commit_await_durable_false() {
        use crate::config::{DurabilityLevel::*, ReadOptions, WriteOptions};
        use fail_parallel::FailPointRegistry;

        // Setup database with failpoints to pause durable writes
        let fp_registry = Arc::new(FailPointRegistry::new());
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::builder("/tmp/test_txn_commit_await_durable_false", object_store)
            .with_fp_registry(fp_registry.clone())
            .build()
            .await
            .unwrap();

        // Pause durable writes to object storage
        fail_parallel::cfg(fp_registry.clone(), "write-wal-sst-io-error", "pause").unwrap();

        // Begin a transaction and write a key
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn.put(b"k", b"v").unwrap();

        // Commit without waiting for durability
        txn.commit_with_options(&WriteOptions {
            await_durable: false,
        })
        .await
        .unwrap();

        // Memory (in-memory) read should see the value
        let val = db
            .get_with_options(b"k", &ReadOptions::new().with_durability_filter(Memory))
            .await
            .unwrap();
        assert_eq!(val, Some(Bytes::from_static(b"v")));

        // Remote (durable) read should not see the value yet
        let val = db
            .get_with_options(b"k", &ReadOptions::new().with_durability_filter(Remote))
            .await
            .unwrap();
        assert_eq!(val, None);

        // Clean up
        fail_parallel::cfg(fp_registry.clone(), "write-wal-sst-io-error", "off").unwrap();
        db.close().await.unwrap();
    }

    // Transaction test structures for table-driven tests
    /// One row of the table-driven transaction tests: the scenario to run and the
    /// result expected from each step.
    #[derive(Debug, Clone)]
    struct TransactionTestCase {
        /// Human-readable label for the case (presumably used in failure output; its
        /// consumer is not shown here).
        name: &'static str,
        /// Isolation level the scenario's transaction is opened with.
        isolation_level: IsolationLevel,
        /// Key/value pairs written to the database before the transaction begins.
        initial_data: Vec<(Bytes, Bytes)>,
        /// Steps to execute, in order.
        operations: Vec<TransactionTestOp>,
        /// Expected outcome of each step; presumably compared position by position
        /// against the results of `operations` (verify against the test body).
        expected_results: Vec<TransactionTestOpResult>,
    }

    /// A single step of a table-driven transaction test. `Txn*` steps act on the
    /// scenario's transaction, `Db*` steps act on the database directly.
    #[derive(Debug, Clone)]
    // NOTE(review): presumably not every variant is constructed by the test table.
    #[allow(dead_code)]
    enum TransactionTestOp {
        /// Read a key through the transaction.
        TxnGet(Bytes),
        /// Scan the inclusive key range `start..=end` through the transaction.
        TxnScan(Bytes, Bytes),
        /// Buffer a put of `(key, value)` in the transaction.
        TxnPut(Bytes, Bytes),
        /// Buffer a delete of a key in the transaction.
        TxnDelete(Bytes),
        /// Commit the transaction; it cannot be used by later steps.
        TxnCommit,
        /// Roll the transaction back; it cannot be used by later steps.
        TxnRollback,
        /// Write `(key, value)` directly to the database, outside the transaction.
        DbPut(Bytes, Bytes),
        /// Read a key directly from the database, outside the transaction.
        DbGet(Bytes),
    }

    /// Outcome recorded for one `TransactionTestOp`.
    #[derive(Debug, Clone, PartialEq)]
    enum TransactionTestOpResult {
        /// Value returned by a `TxnGet` or `DbGet` step.
        Got(Option<Bytes>),
        /// Keys yielded, in iteration order, by a `TxnScan` step.
        Scanned(Vec<Bytes>),
        /// The step succeeded and produced no value (puts, deletes, rollback,
        /// successful commit).
        Empty,
        /// A `TxnCommit` step returned an error; any commit error is reported this way.
        Conflicted,
        /// A `Txn*` step was issued after the transaction had already been committed
        /// or rolled back.
        Invalid,
    }

    async fn execute_transaction_test_ops(
        db: crate::Db,
        operations: Vec<TransactionTestOp>,
        initial_data: Vec<(Bytes, Bytes)>,
        isolation_level: IsolationLevel,
    ) -> Vec<TransactionTestOpResult> {
        // Setup initial data
        for (key, value) in initial_data {
            db.put(key, value).await.unwrap();
        }

        let mut txn_opt = Some(db.begin(isolation_level).await.unwrap());

        let mut results = Vec::new();
        for operation in operations.iter() {
            let result = match (txn_opt.as_mut(), operation) {
                // Transaction operations with active transaction
                (Some(txn), TransactionTestOp::TxnGet(key)) => {
                    let val = txn.get(key).await.unwrap();
                    TransactionTestOpResult::Got(val)
                }
                (Some(txn), TransactionTestOp::TxnScan(start, end)) => {
                    let mut iter = txn.scan(&start[..]..=&end[..]).await.unwrap();
                    let mut scanned_keys = Vec::new();
                    while let Some(kv) = iter.next().await.unwrap() {
                        scanned_keys.push(kv.key);
                    }
                    TransactionTestOpResult::Scanned(scanned_keys)
                }
                (Some(txn), TransactionTestOp::TxnPut(key, value)) => {
                    txn.put(key, value).unwrap();
                    TransactionTestOpResult::Empty
                }
                (Some(txn), TransactionTestOp::TxnDelete(key)) => {
                    txn.delete(key).unwrap();
                    TransactionTestOpResult::Empty
                }
                (Some(_txn), TransactionTestOp::TxnCommit) => {
                    let txn = txn_opt.take().unwrap();
                    match txn.commit().await {
                        Ok(_) => TransactionTestOpResult::Empty,
                        Err(_) => TransactionTestOpResult::Conflicted,
                    }
                }
                (Some(_txn), TransactionTestOp::TxnRollback) => {
                    let txn = txn_opt.take().unwrap();
                    txn.rollback();
                    TransactionTestOpResult::Empty
                }

                // Database operations
                (_, TransactionTestOp::DbPut(key, value)) => {
                    db.put(key, value).await.unwrap();
                    TransactionTestOpResult::Empty
                }
                (_, TransactionTestOp::DbGet(key)) => {
                    let val = db.get(key).await.unwrap();
                    TransactionTestOpResult::Got(val)
                }

                // Invalid operations (transaction operations without active transaction)
                (None, TransactionTestOp::TxnGet(_))
                | (None, TransactionTestOp::TxnScan(_, _))
                | (None, TransactionTestOp::TxnPut(_, _))
                | (None, TransactionTestOp::TxnDelete(_))
                | (None, TransactionTestOp::TxnCommit)
                | (None, TransactionTestOp::TxnRollback) => TransactionTestOpResult::Invalid,
            };

            results.push(result);
        }

        results
    }

    // Table-driven tests using rstest
    #[rstest]
    #[case::ssi_basic_visibility(
        TransactionTestCase {
            name: "ssi_basic_visibility",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_write_visibility_in_txn(
        TransactionTestCase {
            name: "ssi_write_visibility_in_txn",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v2"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_delete_visibility_in_txn(
        TransactionTestCase {
            name: "ssi_delete_visibility_in_txn",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnDelete(Bytes::from("k1")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(None),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_rollback_visibility(
        TransactionTestCase {
            name: "ssi_rollback_visibility",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnRollback,
                TransactionTestOp::DbGet(Bytes::from("k1")),
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
            ]
        }
    )]
    #[case::si_concurrent_read_snapshot(
        TransactionTestCase {
            name: "ssi_concurrent_read_snapshot",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_write_write_conflict(
        TransactionTestCase {
            name: "ssi_write_write_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v3")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::ssi_read_write_conflict(
        TransactionTestCase {
            name: "ssi_read_write_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v1")),
                TransactionTestOp::TxnPut(Bytes::from("k2"), Bytes::from("v2.1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::si_read_write_no_conflict(
        TransactionTestCase {
            name: "si_read_write_no_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    // SerializableSnapshot, external-write-then-read:
    //   1. DbPut overwrites k1 with "v2" (presumably directly on the Db, outside the
    //      transaction — confirm against the op executor);
    //   2. the transaction then reads k1 and is still expected to get the initial "v1",
    //      not the newer "v2";
    //   3. the transaction writes a different key, k3;
    //   4. the commit is expected to be reported as Conflicted.
    #[case::ssi_write_read_conflict(
        TransactionTestCase {
            name: "ssi_write_read_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnPut(Bytes::from("k3"), Bytes::from("v3")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    // Snapshot, external-write-then-read:
    //   1. DbPut overwrites k1 with "v2" (presumably directly on the Db, outside the
    //      transaction — confirm against the op executor);
    //   2. the transaction then reads k1 and is still expected to get the initial "v1";
    //   3. the transaction commits without having written anything, and the commit is
    //      expected to yield Empty rather than Conflicted.
    #[case::si_write_read_no_conflict(
        TransactionTestCase {
            name: "si_write_read_no_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    // SerializableSnapshot, range-scan-then-external-write:
    //   1. the transaction scans with bounds (k1, k5) and is expected to see all five
    //      initial keys k1..k5, i.e. the expectation includes the upper bound k5;
    //   2. DbPut overwrites k3, a key returned by the scan (presumably directly on the
    //      Db, outside the transaction — confirm against the op executor);
    //   3. the transaction writes k100;
    //   4. the commit is expected to be reported as Conflicted.
    #[case::ssi_range_write_conflict(
        TransactionTestCase {
            name: "ssi_range_write_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2")),
                (Bytes::from("k3"), Bytes::from("v3")),
                (Bytes::from("k4"), Bytes::from("v4")),
                (Bytes::from("k5"), Bytes::from("v5"))
            ],
            operations: vec![
                TransactionTestOp::TxnScan(Bytes::from("k1"), Bytes::from("k5")),
                TransactionTestOp::DbPut(Bytes::from("k3"), Bytes::from("v3_new")),
                TransactionTestOp::TxnPut(Bytes::from("k100"), Bytes::from("v100")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Scanned(vec![Bytes::from("k1"), Bytes::from("k2"), Bytes::from("k3"), Bytes::from("k4"), Bytes::from("k5")]),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    // Snapshot counterpart of the range-scan scenario: identical initial data and
    // operations (scan with bounds (k1, k5), DbPut on the scanned key k3, transactional
    // put of k100, commit), but here the commit is expected to yield Empty rather than
    // Conflicted.
    #[case::si_range_write_no_conflict(
        TransactionTestCase {
            name: "si_range_write_no_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2")),
                (Bytes::from("k3"), Bytes::from("v3")),
                (Bytes::from("k4"), Bytes::from("v4")),
                (Bytes::from("k5"), Bytes::from("v5"))
            ],
            operations: vec![
                TransactionTestOp::TxnScan(Bytes::from("k1"), Bytes::from("k5")),
                TransactionTestOp::DbPut(Bytes::from("k3"), Bytes::from("v3_new")),
                TransactionTestOp::TxnPut(Bytes::from("k100"), Bytes::from("v100")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Scanned(vec![Bytes::from("k1"), Bytes::from("k2"), Bytes::from("k3"), Bytes::from("k4"), Bytes::from("k5")]),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[tokio::test]
    async fn test_txn_table_driven(#[case] test_case: TransactionTestCase) {
        // Runs one table-driven scenario: opens a fresh in-memory database named after the
        // case, hands the case's initial data, operations and isolation level to
        // `execute_transaction_test_ops`, and compares every produced result with the
        // corresponding expected entry.
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open(test_case.name, object_store).await.unwrap();

        // `operations` and `initial_data` are moved out of the case; only `name` and
        // `expected_results` are needed afterwards.
        let results = execute_transaction_test_ops(
            db,
            test_case.operations,
            test_case.initial_data,
            test_case.isolation_level,
        )
        .await;

        // `zip` stops at the shorter sequence, so without this check a run that yields
        // fewer (or more) results than expected would pass without comparing the rest.
        assert_eq!(
            results.len(),
            test_case.expected_results.len(),
            "Test '{}' produced {} results but {} were expected: got {:?}, expected {:?}",
            test_case.name,
            results.len(),
            test_case.expected_results.len(),
            results,
            test_case.expected_results
        );

        for (i, (result, expected)) in results
            .iter()
            .zip(test_case.expected_results.iter())
            .enumerate()
        {
            assert_eq!(
                result, expected,
                "Test '{}' failed at operation {}: expected {:?}, got {:?}",
                test_case.name, i, expected, result
            );
        }
    }

    #[tokio::test]
    async fn test_txn_scan_sees_concurrent_put_in_same_txn() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data: k1 and k3
        db.put(b"k1", b"v1").await.unwrap();
        db.put(b"k3", b"v3").await.unwrap();

        // Begin transaction
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();

        // Test 1: Scan created before put should NOT see the new key or updated value for an existing key
        {
            // Start a scan from k1 to k3 (inclusive)
            let mut iter = txn.scan(&b"k1"[..]..=&b"k3"[..]).await.unwrap();

            // Put k2 in the transaction (after scan has started)
            txn.put(b"k2", b"v2").unwrap();
            // Update k3 within the transaction after the scan has started
            txn.put(b"k3", b"v3_updated").unwrap();

            // Iterate through the results
            let mut results = Vec::new();
            while let Some(kv) = iter.next().await.unwrap() {
                results.push((kv.key.clone(), kv.value.clone()));
            }

            // The iterator should see k1 and k3 (the snapshot at scan time)
            // It should NOT see k2 because the scan was created before k2 was put
            assert_eq!(results.len(), 2);
            assert_eq!(results[0].0, Bytes::from_static(b"k1"));
            assert_eq!(results[0].1, Bytes::from_static(b"v1"));
            assert_eq!(results[1].0, Bytes::from_static(b"k3"));
            assert_eq!(results[1].1, Bytes::from_static(b"v3"));
        } // iter is dropped here

        // Test 2: A new scan after the put should see k2
        {
            let mut iter2 = txn.scan(&b"k1"[..]..=&b"k3"[..]).await.unwrap();
            let mut results2 = Vec::new();
            while let Some(kv) = iter2.next().await.unwrap() {
                results2.push((kv.key.clone(), kv.value.clone()));
            }

            // This new scan should see all three keys and the updated value for k3
            assert_eq!(results2.len(), 3);
            assert_eq!(results2[0].0, Bytes::from_static(b"k1"));
            assert_eq!(results2[1].0, Bytes::from_static(b"k2"));
            assert_eq!(results2[1].1, Bytes::from_static(b"v2"));
            assert_eq!(results2[2].0, Bytes::from_static(b"k3"));
            assert_eq!(results2[2].1, Bytes::from_static(b"v3_updated"));
        } // iter2 is dropped here

        // Commit the transaction
        txn.commit().await.unwrap();

        // Verify k2 is now in the database
        let value = db.get(b"k2").await.unwrap();
        assert_eq!(value, Some(Bytes::from_static(b"v2")));
    }
}