// slatedb 0.12.1 - Docs.rs

use bytes::Bytes;
use parking_lot::{Mutex, RwLock};
use std::collections::HashSet;
use std::ops::RangeBounds;
use std::sync::Arc;
use uuid::Uuid;

use crate::batch::WriteBatch;
use crate::bytes_range::BytesRange;
use crate::config::{MergeOptions, PutOptions, ReadOptions, ScanOptions, WriteOptions};
use crate::db::DbInner;
use crate::db::WriteHandle;
use crate::db_iter::{DbIterator, DbIteratorRangeTracker};
use crate::error::SlateDBError;
use crate::transaction_manager::{IsolationLevel, TransactionManager};
use crate::types::KeyValue;
use crate::DbRead;

/// A database transaction that provides atomic read-write operations with
/// configurable isolation levels. This is the main interface for transactional
/// operations in SlateDB.
///
/// Writes are buffered in an in-memory write batch and are only submitted to the
/// database when the transaction is committed. Reads performed through the
/// transaction are served at the sequence number captured when the transaction
/// started, overlaid with the transaction's own buffered writes.
///
/// # Examples
///
/// Basic transaction usage:
/// ```rust
/// # async fn run() -> Result<(), slatedb::Error> {
/// #     use std::sync::Arc;
/// #     use slatedb::object_store::memory::InMemory;
/// use slatedb::{Db, IsolationLevel};
///
/// #     let object_store = Arc::new(InMemory::new());
/// #     let db = Db::open("path/to/db", object_store).await?;
/// let txn = db.begin(IsolationLevel::Snapshot).await?;
///
/// // Read operations
/// let value = txn.get(b"key").await?;
///
/// // Write operations
/// txn.put(b"key", b"value")?;
/// txn.delete(b"key")?;
///
/// // Commit the transaction
/// txn.commit().await?;
/// # Ok(())
/// # }
/// ```
pub struct DbTransaction {
    /// Transaction ID generated by the transaction manager
    txn_id: Uuid,
    /// Sequence number when the transaction started; passed to the reader on every
    /// `get`/`scan` issued through this transaction.
    started_seq: u64,
    /// Reference to the transaction manager
    txn_manager: Arc<TransactionManager>,
    /// The write batch of the transaction, which contains the uncommitted writes.
    /// Users can read data from the write batch during the transaction, thus providing
    /// an MVCC view of the database.
    ///
    /// DbTransaction is not intended for concurrent use; we use `RwLock` (not `RefCell`) for
    /// interior mutability to preserve `Sync` in async contexts. `RefCell` is `!Sync` and would
    /// make `DbTransaction` `!Sync`, which is incompatible with async code using the `DbRead`
    /// trait.
    write_batch: RwLock<WriteBatch>,
    /// Reference to the database
    db_inner: Arc<DbInner>,
    /// Isolation level for this transaction
    isolation_level: IsolationLevel,
    /// Range trackers for scanned ranges (used for SSI conflict detection). One tracker is
    /// registered per scan in SSI mode and consulted at commit time.
    range_trackers: Mutex<Vec<Arc<DbIteratorRangeTracker>>>,
    /// Keys that should be excluded from write conflict detection when committing.
    untracked_write_keys: RwLock<HashSet<Bytes>>,
}

impl DbTransaction {
    /// Create a transaction registered with `txn_manager`.
    ///
    /// The transaction manager hands out both the transaction ID and the sequence number
    /// the transaction starts at. The write batch is created empty and tagged with the
    /// transaction ID.
    ///
    /// ## Arguments
    /// - `db_inner`: the database this transaction reads from and commits to
    /// - `txn_manager`: the manager that tracks this transaction
    /// - `isolation_level`: the isolation level to enforce
    // NOTE(review): the lint suppression is kept from the original; confirm it is still needed.
    #[allow(unused)]
    pub(crate) fn new(
        db_inner: Arc<DbInner>,
        txn_manager: Arc<TransactionManager>,
        isolation_level: IsolationLevel,
    ) -> Self {
        let (txn_id, started_seq) = txn_manager.new_transaction();
        let write_batch = RwLock::new(WriteBatch::new().with_txn_id(txn_id));

        Self {
            txn_id,
            started_seq,
            txn_manager,
            write_batch,
            db_inner,
            isolation_level,
            range_trackers: Mutex::new(Vec::new()),
            untracked_write_keys: RwLock::new(HashSet::new()),
        }
    }

    /// Look up `key` through the transaction using the default [`ReadOptions`].
    /// In SSI mode the key is recorded as read for conflict detection.
    ///
    /// ## Arguments
    /// - `key`: the key to get
    ///
    /// ## Returns
    /// - `Result<Option<Bytes>, crate::Error>`: the value if it exists, `None` otherwise
    pub async fn get<K: AsRef<[u8]> + Send>(&self, key: K) -> Result<Option<Bytes>, crate::Error> {
        let default_options = ReadOptions::default();
        self.get_with_options(key, &default_options).await
    }

    /// Look up `key` through the transaction using caller-supplied read options.
    /// In SSI mode the key is recorded as read for conflict detection.
    ///
    /// ## Arguments
    /// - `key`: the key to get
    /// - `options`: the read options to use
    ///
    /// ## Returns
    /// - `Result<Option<Bytes>, crate::Error>`: the value if it exists, `None` otherwise
    pub async fn get_with_options<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
        options: &ReadOptions,
    ) -> Result<Option<Bytes>, crate::Error> {
        // Reuse the key-value lookup and keep only the value part.
        let entry = self.get_key_value_with_options(key, options).await?;
        Ok(entry.map(|kv| kv.value))
    }

    /// Look up the full key-value entry for `key` using the default [`ReadOptions`].
    ///
    /// ## Arguments
    /// - `key`: the key to get
    ///
    /// ## Returns
    /// - `Result<Option<KeyValue>, crate::Error>`: the entry if it exists, `None` otherwise
    pub async fn get_key_value<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
    ) -> Result<Option<KeyValue>, crate::Error> {
        let default_options = ReadOptions::default();
        self.get_key_value_with_options(key, &default_options).await
    }

    /// Get a key-value pair from the transaction with custom read options.
    pub async fn get_key_value_with_options<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
        options: &ReadOptions,
    ) -> Result<Option<KeyValue>, crate::Error> {
        self.db_inner.check_closed()?;

        // Track read key for SSI conflict detection if needed
        if self.isolation_level == IsolationLevel::SerializableSnapshot {
            let key_bytes = Bytes::copy_from_slice(key.as_ref());
            let mut read_keys = HashSet::new();
            read_keys.insert(key_bytes);
            self.txn_manager.track_read_keys(&self.txn_id, read_keys);
        }

        let db_state = self.db_inner.state.read().view();

        // Clone the WriteBatch for snapshot isolation
        let write_batch_cloned = self.write_batch.read().clone();

        // For now, delegate to the underlying reader
        let kv = self
            .db_inner
            .reader
            .get_key_value_with_options(
                key,
                options,
                &db_state,
                Some(write_batch_cloned),
                Some(self.started_seq),
            )
            .await
            .map_err(crate::Error::from)?;
        Ok(kv)
    }

    /// Scan `range` through the transaction using the default [`ScanOptions`].
    /// In SSI mode the scanned range is tracked for conflict detection.
    ///
    /// ## Arguments
    /// - `range`: the range of keys to scan
    ///
    /// ## Returns
    /// - `Result<DbIterator, crate::Error>`: An iterator with the results of the scan
    pub async fn scan<K, T>(&self, range: T) -> Result<DbIterator, crate::Error>
    where
        K: AsRef<[u8]> + Send,
        T: RangeBounds<K> + Send,
    {
        let default_options = ScanOptions::default();
        self.scan_with_options(range, &default_options).await
    }

    /// Scan a range of keys with the provided options.
    /// This operation will track the read range for conflict detection in SSI mode.
    ///
    /// The scan is served by the database reader at this transaction's starting sequence
    /// number, with a copy of the transaction's buffered writes passed along so the
    /// iterator reflects the writes made so far in this transaction.
    ///
    /// ## Arguments
    /// - `range`: the range of keys to scan
    /// - `options`: the scan options to use
    ///
    /// ## Returns
    /// - `Result<DbIterator, crate::Error>`: An iterator with the results of the scan
    ///
    /// ## Errors
    /// - Fails if the database reports it is closed, or if the underlying scan fails.
    pub async fn scan_with_options<K, T>(
        &self,
        range: T,
        options: &ScanOptions,
    ) -> Result<DbIterator, crate::Error>
    where
        K: AsRef<[u8]> + Send,
        T: RangeBounds<K> + Send,
    {
        // Fail fast on a closed database, before copying the bounds or registering a
        // range tracker that would never be used.
        self.db_inner.check_closed()?;

        // TODO: this range conversion logic could be extracted into a utility
        let start = range
            .start_bound()
            .map(|b| Bytes::copy_from_slice(b.as_ref()));
        let end = range
            .end_bound()
            .map(|b| Bytes::copy_from_slice(b.as_ref()));
        let range = (start, end);

        // Track read range for SSI conflict detection if needed. The tracker is shared
        // between the iterator and this transaction, which consults it at commit time.
        let range_tracker = if self.isolation_level == IsolationLevel::SerializableSnapshot {
            let tracker = Arc::new(DbIteratorRangeTracker::new());
            self.range_trackers.lock().push(Arc::clone(&tracker));
            Some(tracker)
        } else {
            None
        };

        let db_state = self.db_inner.state.read().view();

        // Clone the WriteBatch for the scan to ensure that the scan within a transaction
        // sees a consistent view of the current writes.
        let write_batch_cloned = self.write_batch.read().clone();

        // For now, delegate to the underlying reader
        self.db_inner
            .reader
            .scan_with_options(
                BytesRange::from(range),
                options,
                &db_state,
                Some(write_batch_cloned),
                Some(self.started_seq),
                range_tracker,
            )
            .await
            .map_err(Into::into)
    }

    /// Scan every key starting with `prefix` using the default [`ScanOptions`].
    /// In SSI mode the scanned range is tracked for conflict detection.
    ///
    /// ## Arguments
    /// - `prefix`: the key prefix to scan
    ///
    /// ## Returns
    /// - `Result<DbIterator, crate::Error>`: An iterator with the results of the scan
    pub async fn scan_prefix<P>(&self, prefix: P) -> Result<DbIterator, crate::Error>
    where
        P: AsRef<[u8]> + Send,
    {
        let default_options = ScanOptions::default();
        self.scan_prefix_with_options(prefix, &default_options).await
    }

    /// Scan every key starting with `prefix` using caller-supplied scan options.
    /// In SSI mode the scanned range is tracked for conflict detection.
    ///
    /// ## Arguments
    /// - `prefix`: the key prefix to scan
    /// - `options`: the scan options to use
    ///
    /// ## Returns
    /// - `Result<DbIterator, crate::Error>`: An iterator with the results of the scan
    pub async fn scan_prefix_with_options<P>(
        &self,
        prefix: P,
        options: &ScanOptions,
    ) -> Result<DbIterator, crate::Error>
    where
        P: AsRef<[u8]> + Send,
    {
        // Turn the prefix into a key range and reuse the range scan.
        let prefix_range = BytesRange::from_prefix(prefix.as_ref());
        self.scan_with_options(prefix_range, options).await
    }

    /// Buffer a put of `key` -> `value` in the transaction using the default [`PutOptions`].
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to write
    /// - `value`: the value to write
    ///
    /// ## Errors
    /// - None in practice: the write is only buffered in the write batch.
    pub fn put<K, V>(&self, key: K, value: V) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let default_options = PutOptions::default();
        self.put_with_options(key, value, &default_options)
    }

    /// Buffer a put of `key` -> `value` in the transaction using caller-supplied options.
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to write
    /// - `value`: the value to write
    /// - `options`: the put options to use
    ///
    /// ## Errors
    /// - None in practice: the write is only buffered in the write batch.
    pub fn put_with_options<K, V>(
        &self,
        key: K,
        value: V,
        options: &PutOptions,
    ) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let mut batch = self.write_batch.write();
        batch.put_with_options(key, value, options);
        Ok(())
    }

    /// Explicitly register keys as read for conflict detection.
    ///
    /// Keys passed here are handed to the transaction manager's read set regardless of
    /// the isolation level. This makes selective read-write conflict detection possible
    /// in Snapshot Isolation mode, where reads issued through `get()` are not tracked
    /// automatically (they are only tracked in SSI mode).
    ///
    /// ## Arguments
    /// - `keys`: an iterator of keys to mark as read
    ///
    /// ## Examples
    /// ```rust
    /// # async fn example() -> Result<(), slatedb::Error> {
    /// # use std::sync::Arc;
    /// # use slatedb::object_store::memory::InMemory;
    /// use slatedb::{Db, IsolationLevel};
    ///
    /// # let object_store = Arc::new(InMemory::new());
    /// # let db = Db::open("test_path", object_store).await?;
    /// let txn = db.begin(IsolationLevel::Snapshot).await?;
    /// txn.mark_read([b"key1", b"key2", b"key3"])?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn mark_read<K, I>(&self, keys: I) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        I: IntoIterator<Item = K>,
    {
        // No isolation-level check here on purpose: explicitly marked reads are always
        // tracked. The only difference between SI and SSI is whether ordinary reads
        // end up in the read set.
        self.txn_manager.track_read_keys(
            &self.txn_id,
            keys.into_iter()
                .map(|key| Bytes::copy_from_slice(key.as_ref())),
        );
        Ok(())
    }

    /// Exclude written keys from conflict detection.
    ///
    /// Keys passed to this method are still written atomically with the rest of the
    /// transaction, but are excluded from transaction conflict detection on commit for
    /// both this transaction and other transactions.
    ///
    /// This means:
    /// - If another transaction reads a key that this transaction wrote and unmarked,
    ///   that key will not cause a read-write conflict.
    /// - If another transaction writes a key that this transaction wrote and unmarked,
    ///   that key will not cause a write-write conflict.
    ///
    /// You may call `unmark_write` either before or after writing a key in the transaction.
    /// Once a key is unmarked, it cannot be marked again within the same transaction and
    /// remains unmarked for the duration of this transaction, even if `put`, `merge`, or
    /// `delete` is called on the same key later in the transaction.
    ///
    /// ## Arguments
    /// - `keys`: an iterator of keys to exclude from write conflict tracking
    ///
    /// ## Examples
    /// ```rust
    /// # async fn example() -> Result<(), slatedb::Error> {
    /// # use std::sync::Arc;
    /// # use slatedb::object_store::memory::InMemory;
    /// use slatedb::{Db, IsolationLevel};
    ///
    /// # let object_store = Arc::new(InMemory::new());
    /// # let db = Db::open("test_path", object_store).await?;
    /// let txn = db.begin(IsolationLevel::Snapshot).await?;
    /// txn.put(b"counter", b"1")?;
    /// txn.unmark_write([b"counter"])?;
    /// txn.put(b"counter", b"2")?;
    /// txn.commit().await?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn unmark_write<K, I>(&self, keys: I) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        I: IntoIterator<Item = K>,
    {
        let newly_untracked = keys
            .into_iter()
            .map(|key| Bytes::copy_from_slice(key.as_ref()));
        // The set only ever grows; it is consulted when the transaction commits.
        self.untracked_write_keys.write().extend(newly_untracked);
        Ok(())
    }

    /// Buffer a merge of `value` into `key` using the default [`MergeOptions`].
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to merge into
    /// - `value`: the merge operand
    pub fn merge<K, V>(&self, key: K, value: V) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let default_options = MergeOptions::default();
        self.merge_with_options(key, value, &default_options)
    }

    /// Buffer a merge of `value` into `key` using caller-supplied options.
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to merge into
    /// - `value`: the merge operand
    /// - `options`: the merge options to use
    pub fn merge_with_options<K, V>(
        &self,
        key: K,
        value: V,
        options: &MergeOptions,
    ) -> Result<(), crate::Error>
    where
        K: AsRef<[u8]>,
        V: AsRef<[u8]>,
    {
        let mut batch = self.write_batch.write();
        batch.merge_with_options(key, value, options);
        Ok(())
    }

    /// Buffer a delete of `key` in the transaction.
    /// Nothing is written to the database until the transaction is committed.
    ///
    /// ## Arguments
    /// - `key`: the key to delete
    ///
    /// ## Errors
    /// - None in practice: the delete is only buffered in the write batch.
    pub fn delete<K: AsRef<[u8]>>(&self, key: K) -> Result<(), crate::Error> {
        let mut batch = self.write_batch.write();
        batch.delete(key);
        Ok(())
    }

    /// Commit the transaction using the default [`WriteOptions`].
    ///
    /// This method finalizes the transaction by submitting all pending puts, deletes, and
    /// other operations from the write batch to the database. For a non-empty batch, conflict
    /// detection (read-write and write-write) is deferred to the task that processes the
    /// WriteBatch, which ensures the atomicity of transactions.
    ///
    /// If the transaction's write batch is empty, nothing is written: the transaction is only
    /// checked for conflicts against the reads it tracked, and `Ok(None)` is returned when
    /// there are none.
    ///
    /// ## Returns
    /// - `Ok(Some(WriteHandle))` if the commit is successful and there are writes in the batch.
    /// - `Ok(None)` if the commit is successful but the write batch is empty.
    ///
    /// ## Errors
    /// - Returns `Error` if the commit operation fails, which could be due to:
    ///   - Database I/O errors
    ///   - Concurrency conflicts, including conflicts on tracked reads of an empty transaction
    pub async fn commit(self) -> Result<Option<WriteHandle>, crate::Error> {
        let default_options = WriteOptions::default();
        self.commit_with_options(&default_options).await
    }

    /// Commit the transaction with custom write options.
    ///
    /// Behaves like [`DbTransaction::commit`], but lets callers specify custom
    /// [`WriteOptions`], such as `await_durable`.
    ///
    /// ## Arguments
    /// - `options`: the write options to use for the commit
    ///
    /// ## Returns
    /// - `Ok(Some(WriteHandle))` if the commit is successful and there are writes in the batch.
    /// - `Ok(None)` if the write batch is empty and no conflict was found for the
    ///   transaction's tracked reads.
    ///
    /// ## Errors
    /// - Returns `Error` if the commit operation fails, which could be due to:
    ///   - Database I/O errors
    ///   - Concurrency conflicts, detected either here (empty batch) or during WriteBatch
    ///     processing
    pub async fn commit_with_options(
        self,
        options: &WriteOptions,
    ) -> Result<Option<WriteHandle>, crate::Error> {
        // Copy the buffered writes out from under the lock for submission to the database.
        let batch = self.write_batch.read().clone();

        // Under SSI, hand the ranges that scans actually covered to the transaction manager
        // so they take part in conflict detection.
        if self.isolation_level == IsolationLevel::SerializableSnapshot {
            for range in self
                .range_trackers
                .lock()
                .iter()
                .filter(|tracker| tracker.has_data())
                .filter_map(|tracker| tracker.get_range())
            {
                self.txn_manager.track_read_range(&self.txn_id, range);
            }
        }

        // An empty batch means a no-op or read-only transaction: nothing is written, but
        // the tracked reads must still be free of conflicts.
        if batch.is_empty() {
            return match batch.txn_id.as_ref() {
                Some(txn_id) if self.txn_manager.check_has_conflict(txn_id) => {
                    Err(SlateDBError::TransactionConflict.into())
                }
                _ => Ok(None),
            };
        }

        // Register the written keys for conflict detection, leaving out the ones that
        // were explicitly unmarked.
        let tracked_write_keys = {
            let untracked = self.untracked_write_keys.read();
            batch
                .keys()
                .into_iter()
                .filter(|key| !untracked.contains(key))
                .collect()
        };
        self.txn_manager
            .track_write_keys(&self.txn_id, &tracked_write_keys);

        // Submit the WriteBatch to the database for processing. The batch is sent to a
        // dedicated background task (in batch_write.rs) that processes all WriteBatches
        // sequentially, ensuring no concurrent writes. Both conflict checking & persisting
        // are handled there.
        let handle = self.db_inner.write_with_options(batch, options).await?;
        Ok(Some(handle))
    }

    /// Roll back the transaction, discarding all buffered operations.
    ///
    /// This only consumes the transaction; dropping a transaction without committing it
    /// has the same effect.
    pub fn rollback(self) {
        // All cleanup lives in the `Drop` implementation.
        drop(self);
    }

    /// Get the sequence number this transaction was started at. This is equivalent to
    /// the snapshot sequence number for this transaction, which determines data visibility
    /// for reads in this transaction.
    ///
    /// ## Returns
    /// - `u64`: the sequence number assigned when the transaction was created
    pub fn seqnum(&self) -> u64 {
        self.started_seq
    }

    /// Get the transaction ID. This is a unique identifier for this transaction, generated
    /// by the transaction manager.
    ///
    /// ## Returns
    /// - `Uuid`: the ID assigned when the transaction was created
    pub fn id(&self) -> Uuid {
        self.txn_id
    }
}

#[async_trait::async_trait]
impl DbRead for DbTransaction {
    async fn get_with_options<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
        options: &ReadOptions,
    ) -> Result<Option<Bytes>, crate::Error> {
        self.get_with_options(key, options).await
    }

    async fn get_key_value_with_options<K: AsRef<[u8]> + Send>(
        &self,
        key: K,
        options: &ReadOptions,
    ) -> Result<Option<KeyValue>, crate::Error> {
        self.get_key_value_with_options(key, options).await
    }

    async fn scan_with_options<K, T>(
        &self,
        range: T,
        options: &ScanOptions,
    ) -> Result<DbIterator, crate::Error>
    where
        K: AsRef<[u8]> + Send,
        T: RangeBounds<K> + Send,
    {
        self.scan_with_options(range, options).await
    }
}

/// Unregisters the transaction from the transaction manager when it goes out of scope.
/// A transaction dropped without being committed is considered rolled back.
impl Drop for DbTransaction {
    fn drop(&mut self) {
        let txn_id = &self.txn_id;
        self.txn_manager.drop_txn(txn_id);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::merge_operator::{MergeOperator, MergeOperatorError};
    use crate::object_store::memory::InMemory;
    use rstest::rstest;
    use std::sync::Arc;

    /// Test merge operator that treats every value as a little-endian `u64` counter
    /// and merges by addition.
    struct CounterMergeOperator;

    impl CounterMergeOperator {
        /// Decode a little-endian `u64`; panics if `raw` is not exactly 8 bytes long.
        fn decode(raw: &[u8]) -> u64 {
            u64::from_le_bytes(raw.try_into().unwrap())
        }
    }

    impl MergeOperator for CounterMergeOperator {
        /// Add a single operand to the existing counter (or to 0 when there is none).
        fn merge(
            &self,
            _key: &Bytes,
            existing_value: Option<Bytes>,
            value: Bytes,
        ) -> Result<Bytes, MergeOperatorError> {
            let current = existing_value.as_deref().map_or(0, Self::decode);
            let sum = current + Self::decode(&value);
            Ok(Bytes::copy_from_slice(&sum.to_le_bytes()))
        }

        /// Add every operand, in order, to the existing counter (or to 0 when there is none).
        fn merge_batch(
            &self,
            _key: &Bytes,
            existing_value: Option<Bytes>,
            operands: &[Bytes],
        ) -> Result<Bytes, MergeOperatorError> {
            let base = existing_value.as_deref().map_or(0, Self::decode);
            let total = operands
                .iter()
                .fold(base, |acc, operand| acc + Self::decode(operand));
            Ok(Bytes::copy_from_slice(&total.to_le_bytes()))
        }
    }

    /// A snapshot-isolation transaction sees data written before it began and does not
    /// see data written by others after it began.
    #[tokio::test]
    async fn test_txn_basic_visibility() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();

        // Begin transaction
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();

        // Put data from others, after the transaction's snapshot was taken
        db.put(b"k2", b"v2").await.unwrap();

        // Read within transaction - should see the initial data
        let value = txn.get(b"k1").await.unwrap();
        assert_eq!(value, Some(Bytes::from_static(b"v1")));

        // Read within transaction - the later write must stay invisible to the snapshot
        let value = txn.get(b"k2").await.unwrap();
        assert_eq!(value, None);

        // Commit transaction
        txn.commit().await.unwrap();
    }

    /// A transaction reads back its own uncommitted write instead of the value stored
    /// in the database.
    #[tokio::test]
    async fn test_txn_write_visibility_in_txn() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();

        // Begin transaction
        let txn = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();

        // Write within transaction
        txn.put(b"k1", b"v2").unwrap();

        // Read within transaction - should see the updated value in the transaction
        let value = txn.get(b"k1").await.unwrap();
        assert_eq!(value, Some(Bytes::from_static(b"v2")));

        // Commit transaction
        txn.commit().await.unwrap();
    }

    /// Two snapshot-isolation transactions write the same key: the first commit succeeds
    /// and the second is rejected.
    #[tokio::test]
    async fn test_txn_si_commit_conflict() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();

        // Begin first transaction
        let txn1 = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn1.put(b"k1", b"v2").unwrap();

        // Begin second transaction
        let txn2 = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn2.put(b"k1", b"v3").unwrap();

        // Commit first transaction - should succeed
        txn1.commit().await.unwrap();

        // Commit second transaction - should fail due to conflict
        let result = txn2.commit().await;
        assert!(result.is_err());
    }

    /// A snapshot-isolation transaction is rejected at commit when a plain (non-transactional)
    /// database write touched the same key after the transaction buffered its own write.
    #[tokio::test]
    async fn test_txn_si_commit_conflict_with_db_writes() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();

        // Begin first transaction
        let txn1 = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn1.put(b"k1", b"v2").unwrap();

        // DB put on the same key
        db.put(b"k1", b"v3").await.unwrap();

        // Commit transaction - should conflict
        let result = txn1.commit().await;
        assert!(result.is_err());
    }

    /// Under SSI, a transaction that read a key is rejected at commit when another
    /// transaction committed a write to that key in the meantime, even though the two
    /// transactions write disjoint keys.
    #[tokio::test]
    async fn test_txn_ssi_commit_conflict() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();
        db.put(b"k2", b"v2.1").await.unwrap();

        // Begin first transaction
        let txn1 = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();
        txn1.put(b"k1", b"v2").unwrap();
        txn1.put(b"k2", b"v2.2").unwrap();

        // Begin second transaction; it reads k2 (still the committed value) and writes k3
        let txn2 = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();
        let val2 = txn2.get(b"k2").await.unwrap();
        assert_eq!(val2, Some(Bytes::from_static(b"v2.1")));
        txn2.put(b"k3", b"v3").unwrap();

        // Commit first transaction - should succeed
        txn1.commit().await.unwrap();

        // Commit second transaction - should fail due to conflict for reading k2
        let result = txn2.commit().await;
        assert!(result.is_err());
    }

    /// Under SSI, a transaction that scanned a key range is rejected at commit when another
    /// transaction committed a write inside that range in the meantime.
    #[tokio::test]
    async fn test_txn_ssi_commit_conflit_with_ranges() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data
        db.put(b"k1", b"v1").await.unwrap();
        db.put(b"k2", b"v2.1").await.unwrap();
        db.put(b"k3", b"v3").await.unwrap();

        // Begin first transaction
        let txn1 = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();

        // Begin second transaction
        let txn2 = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();

        // Transaction 2 scans k2..=k3 (inclusive on both ends)
        {
            let mut iter = txn2.scan(&b"k2"[..]..=&b"k3"[..]).await.unwrap();
            while let Some(_kv) = iter.next().await.unwrap() {
                // Just iterate through the range to track it
            }
        }

        // Transaction 1 writes within the range that transaction 2 scanned
        txn1.put(b"k2", b"v2.2").unwrap();
        txn1.commit().await.unwrap();

        // Transaction 2 tries to write something outside the scanned range
        txn2.put(b"k4", b"v4").unwrap();

        // Commit second transaction - should fail due to phantom conflict
        // because it read a range that was modified by transaction 1
        let result = txn2.commit().await;
        assert!(result.is_err());
    }

    /// Committing with `await_durable: false` returns while the `write-wal-sst-io-error`
    /// failpoint is paused: the value is readable with the `Memory` durability filter but
    /// not with the `Remote` one.
    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
    async fn test_txn_commit_await_durable_false() {
        use crate::config::{DurabilityLevel::*, ReadOptions, WriteOptions};
        use fail_parallel::FailPointRegistry;

        // Setup database with failpoints to pause durable writes
        let fp_registry = Arc::new(FailPointRegistry::new());
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::builder("/tmp/test_txn_commit_await_durable_false", object_store)
            .with_fp_registry(fp_registry.clone())
            .build()
            .await
            .unwrap();

        // Pause durable writes to object storage
        fail_parallel::cfg(fp_registry.clone(), "write-wal-sst-io-error", "pause").unwrap();

        // Begin a transaction and write a key
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn.put(b"k", b"v").unwrap();

        // Commit without waiting for durability
        txn.commit_with_options(&WriteOptions {
            await_durable: false,
        })
        .await
        .unwrap();

        // Memory (in-memory) read should see the value
        let val = db
            .get_with_options(b"k", &ReadOptions::new().with_durability_filter(Memory))
            .await
            .unwrap();
        assert_eq!(val, Some(Bytes::from_static(b"v")));

        // Remote (durable) read should not see the value yet
        let val = db
            .get_with_options(b"k", &ReadOptions::new().with_durability_filter(Remote))
            .await
            .unwrap();
        assert_eq!(val, None);

        // Clean up: release the failpoint so the paused write can proceed, then close
        fail_parallel::cfg(fp_registry.clone(), "write-wal-sst-io-error", "off").unwrap();
        db.close().await.unwrap();
    }

    // Transaction test structures for table-driven tests

    /// One table-driven scenario: the data to preload, the isolation level to run under,
    /// the operations to execute and the results expected from them.
    #[derive(Debug, Clone)]
    struct TransactionTestCase {
        /// Human-readable name of the scenario.
        name: &'static str,
        /// Isolation level the scenario's transaction is started with.
        isolation_level: IsolationLevel,
        /// Key-value pairs written to the database before the transaction begins.
        initial_data: Vec<(Bytes, Bytes)>,
        /// Operations to execute, in order.
        operations: Vec<TransactionTestOp>,
        /// Expected outcomes; presumably one per entry of `operations`, in the same
        /// order (verify against the test driver).
        expected_results: Vec<TransactionTestOpResult>,
    }

    /// A single step of a table-driven transaction test. `Txn*` steps act on the
    /// transaction under test; `Db*` steps presumably act on the database directly,
    /// outside the transaction (their handling is not visible here; verify against the driver).
    #[derive(Debug, Clone)]
    #[allow(dead_code)]
    enum TransactionTestOp {
        /// Read a key through the transaction.
        TxnGet(Bytes),
        /// Scan the inclusive key range `start..=end` through the transaction.
        TxnScan(Bytes, Bytes),
        /// Buffer a put of key -> value in the transaction.
        TxnPut(Bytes, Bytes),
        /// Buffer a delete of a key in the transaction.
        TxnDelete(Bytes),
        /// Explicitly mark a key as read in the transaction.
        TxnMarkRead(Bytes),
        /// Commit the transaction.
        TxnCommit,
        /// Roll back the transaction.
        TxnRollback,
        /// Put key -> value, presumably directly on the database.
        DbPut(Bytes, Bytes),
        /// Read a key, presumably directly from the database.
        DbGet(Bytes),
    }

    /// Outcome recorded for one executed [`TransactionTestOp`].
    #[derive(Debug, Clone, PartialEq)]
    enum TransactionTestOpResult {
        /// Value returned by a get (`None` when the key was not found).
        Got(Option<Bytes>),
        /// Keys returned by a scan, in iteration order.
        Scanned(Vec<Bytes>),
        /// The operation produced no value (e.g. put, delete, mark-read).
        Empty,
        /// Presumably: a commit was rejected because of a conflict (verify against the driver).
        Conflicted,
        /// Presumably: the operation could not be applied in the current state, e.g. no
        /// active transaction (verify against the driver).
        Invalid,
    }

    async fn execute_transaction_test_ops(
        db: crate::Db,
        operations: Vec<TransactionTestOp>,
        initial_data: Vec<(Bytes, Bytes)>,
        isolation_level: IsolationLevel,
    ) -> Vec<TransactionTestOpResult> {
        // Setup initial data
        for (key, value) in initial_data {
            db.put(key, value).await.unwrap();
        }

        let mut txn_opt = Some(db.begin(isolation_level).await.unwrap());

        let mut results = Vec::new();
        for operation in operations.iter() {
            let result = match (txn_opt.as_mut(), operation) {
                // Transaction operations with active transaction
                (Some(txn), TransactionTestOp::TxnGet(key)) => {
                    let val = txn.get(key).await.unwrap();
                    TransactionTestOpResult::Got(val)
                }
                (Some(txn), TransactionTestOp::TxnScan(start, end)) => {
                    let mut iter = txn.scan(&start[..]..=&end[..]).await.unwrap();
                    let mut scanned_keys = Vec::new();
                    while let Some(kv) = iter.next().await.unwrap() {
                        scanned_keys.push(kv.key);
                    }
                    TransactionTestOpResult::Scanned(scanned_keys)
                }
                (Some(txn), TransactionTestOp::TxnPut(key, value)) => {
                    txn.put(key, value).unwrap();
                    TransactionTestOpResult::Empty
                }
                (Some(txn), TransactionTestOp::TxnDelete(key)) => {
                    txn.delete(key).unwrap();
                    TransactionTestOpResult::Empty
                }
                (Some(txn), TransactionTestOp::TxnMarkRead(key)) => {
                    txn.mark_read([key]).unwrap();
                    TransactionTestOpResult::Empty
                }
                (Some(_txn), TransactionTestOp::TxnCommit) => {
                    let txn = txn_opt.take().unwrap();
                    match txn.commit().await {
                        Ok(_) => TransactionTestOpResult::Empty,
                        Err(_) => TransactionTestOpResult::Conflicted,
                    }
                }
                (Some(_txn), TransactionTestOp::TxnRollback) => {
                    let txn = txn_opt.take().unwrap();
                    txn.rollback();
                    TransactionTestOpResult::Empty
                }

                // Database operations
                (_, TransactionTestOp::DbPut(key, value)) => {
                    db.put(key, value).await.unwrap();
                    TransactionTestOpResult::Empty
                }
                (_, TransactionTestOp::DbGet(key)) => {
                    let val = db.get(key).await.unwrap();
                    TransactionTestOpResult::Got(val)
                }

                // Invalid operations (transaction operations without active transaction)
                (None, TransactionTestOp::TxnGet(_))
                | (None, TransactionTestOp::TxnScan(_, _))
                | (None, TransactionTestOp::TxnPut(_, _))
                | (None, TransactionTestOp::TxnDelete(_))
                | (None, TransactionTestOp::TxnMarkRead(_))
                | (None, TransactionTestOp::TxnCommit)
                | (None, TransactionTestOp::TxnRollback) => TransactionTestOpResult::Invalid,
            };

            results.push(result);
        }

        results
    }

    // Table-driven tests using rstest
    #[rstest]
    #[case::ssi_basic_visibility(
        TransactionTestCase {
            name: "ssi_basic_visibility",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_write_visibility_in_txn(
        TransactionTestCase {
            name: "ssi_write_visibility_in_txn",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v2"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_delete_visibility_in_txn(
        TransactionTestCase {
            name: "ssi_delete_visibility_in_txn",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnDelete(Bytes::from("k1")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(None),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_rollback_visibility(
        TransactionTestCase {
            name: "ssi_rollback_visibility",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnRollback,
                TransactionTestOp::DbGet(Bytes::from("k1")),
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
            ]
        }
    )]
    #[case::si_concurrent_read_snapshot(
        TransactionTestCase {
            name: "si_concurrent_read_snapshot",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_write_write_conflict(
        TransactionTestCase {
            name: "ssi_write_write_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v3")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::ssi_read_write_conflict(
        TransactionTestCase {
            name: "ssi_read_write_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v1")),
                TransactionTestOp::TxnPut(Bytes::from("k2"), Bytes::from("v2.1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::si_read_write_no_conflict(
        TransactionTestCase {
            name: "si_read_write_no_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_write_read_conflict(
        TransactionTestCase {
            name: "ssi_write_read_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnPut(Bytes::from("k3"), Bytes::from("v3")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::si_write_read_no_conflict(
        TransactionTestCase {
            name: "si_write_read_no_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_range_write_conflict(
        TransactionTestCase {
            name: "ssi_range_write_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2")),
                (Bytes::from("k3"), Bytes::from("v3")),
                (Bytes::from("k4"), Bytes::from("v4")),
                (Bytes::from("k5"), Bytes::from("v5"))
            ],
            operations: vec![
                TransactionTestOp::TxnScan(Bytes::from("k1"), Bytes::from("k5")),
                TransactionTestOp::DbPut(Bytes::from("k3"), Bytes::from("v3_new")),
                TransactionTestOp::TxnPut(Bytes::from("k100"), Bytes::from("v100")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Scanned(vec![Bytes::from("k1"), Bytes::from("k2"), Bytes::from("k3"), Bytes::from("k4"), Bytes::from("k5")]),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::si_range_write_no_conflict(
        TransactionTestCase {
            name: "si_range_write_no_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2")),
                (Bytes::from("k3"), Bytes::from("v3")),
                (Bytes::from("k4"), Bytes::from("v4")),
                (Bytes::from("k5"), Bytes::from("v5"))
            ],
            operations: vec![
                TransactionTestOp::TxnScan(Bytes::from("k1"), Bytes::from("k5")),
                TransactionTestOp::DbPut(Bytes::from("k3"), Bytes::from("v3_new")),
                TransactionTestOp::TxnPut(Bytes::from("k100"), Bytes::from("v100")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Scanned(vec![Bytes::from("k1"), Bytes::from("k2"), Bytes::from("k3"), Bytes::from("k4"), Bytes::from("k5")]),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_mark_read_conflict(
        TransactionTestCase {
            name: "ssi_mark_read_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnMarkRead(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnPut(Bytes::from("k2"), Bytes::from("v2")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::ssi_mark_read_multiple_keys_conflict(
        TransactionTestCase {
            name: "ssi_mark_read_multiple_keys_conflict",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2")),
                (Bytes::from("k3"), Bytes::from("v3"))
            ],
            operations: vec![
                TransactionTestOp::TxnMarkRead(Bytes::from("k1")),
                TransactionTestOp::TxnMarkRead(Bytes::from("k2")),
                TransactionTestOp::DbPut(Bytes::from("k2"), Bytes::from("v2_new")),
                TransactionTestOp::TxnPut(Bytes::from("k4"), Bytes::from("v4")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::ssi_mark_read_no_conflict_on_different_key(
        TransactionTestCase {
            name: "ssi_mark_read_no_conflict_on_different_key",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2"))
            ],
            operations: vec![
                TransactionTestOp::TxnMarkRead(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k2"), Bytes::from("v2_new")),
                TransactionTestOp::TxnPut(Bytes::from("k3"), Bytes::from("v3")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
            ]
        }
    )]
    #[case::ssi_mark_read_conflict_without_write(
        TransactionTestCase {
            name: "ssi_mark_read_conflict_without_write",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnMarkRead(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::ssi_get_conflict_without_write(
        TransactionTestCase {
            name: "ssi_get_conflict_without_write",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnGet(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Got(Some(Bytes::from("v1"))),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::ssi_scan_conflict_without_write(
        TransactionTestCase {
            name: "ssi_scan_conflict_without_write",
            isolation_level: IsolationLevel::SerializableSnapshot,
            initial_data: vec![
                (Bytes::from("k1"), Bytes::from("v1")),
                (Bytes::from("k2"), Bytes::from("v2")),
                (Bytes::from("k3"), Bytes::from("v3")),
            ],
            operations: vec![
                TransactionTestOp::TxnScan(Bytes::from("k1"), Bytes::from("k3")),
                TransactionTestOp::DbPut(Bytes::from("k2"), Bytes::from("v2_new")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Scanned(vec![
                    Bytes::from("k1"),
                    Bytes::from("k2"),
                    Bytes::from("k3"),
                ]),
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::si_mark_read_conflict(
        TransactionTestCase {
            name: "si_mark_read_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnMarkRead(Bytes::from("k1")),
                TransactionTestOp::DbPut(Bytes::from("k1"), Bytes::from("v2")),
                TransactionTestOp::TxnPut(Bytes::from("k2"), Bytes::from("v2")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[case::si_mark_read_write_write_conflict(
        TransactionTestCase {
            name: "si_mark_read_write_write_conflict",
            isolation_level: IsolationLevel::Snapshot,
            initial_data: vec![(Bytes::from("k1"), Bytes::from("v1"))],
            operations: vec![
                TransactionTestOp::TxnMarkRead(Bytes::from("k1")),
                TransactionTestOp::TxnPut(Bytes::from("k2"), Bytes::from("v2")),
                TransactionTestOp::DbPut(Bytes::from("k2"), Bytes::from("v2_db")),
                TransactionTestOp::TxnCommit,
            ],
            expected_results: vec![
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Empty,
                TransactionTestOpResult::Conflicted,
            ]
        }
    )]
    #[tokio::test]
    async fn test_txn_table_driven(#[case] test_case: TransactionTestCase) {
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open(test_case.name, object_store).await.unwrap();

        let initial_data_bytes: Vec<(Bytes, Bytes)> = test_case.initial_data.clone();

        let results = execute_transaction_test_ops(
            db,
            test_case.operations,
            initial_data_bytes,
            test_case.isolation_level,
        )
        .await;

        for (i, (result, expected)) in results
            .iter()
            .zip(test_case.expected_results.iter())
            .enumerate()
        {
            assert_eq!(
                result, expected,
                "Test '{}' failed at operation {}: expected {:?}, got {:?}",
                test_case.name, i, expected, result
            );
        }
    }

    #[tokio::test]
    async fn test_txn_scan_sees_concurrent_put_in_same_txn() {
        // Setup database with initial data
        let object_store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_db", object_store).await.unwrap();

        // Put initial data: k1 and k3
        db.put(b"k1", b"v1").await.unwrap();
        db.put(b"k3", b"v3").await.unwrap();

        // Begin transaction
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();

        // Test 1: Scan created before put should NOT see the new key or updated value for an existing key
        {
            // Start a scan from k1 to k3 (inclusive)
            let mut iter = txn.scan(&b"k1"[..]..=&b"k3"[..]).await.unwrap();

            // Put k2 in the transaction (after scan has started)
            txn.put(b"k2", b"v2").unwrap();
            // Update k3 within the transaction after the scan has started
            txn.put(b"k3", b"v3_updated").unwrap();

            // Iterate through the results
            let mut results = Vec::new();
            while let Some(kv) = iter.next().await.unwrap() {
                results.push((kv.key.clone(), kv.value.clone()));
            }

            // The iterator should see k1 and k3 (the snapshot at scan time)
            // It should NOT see k2 because the scan was created before k2 was put
            assert_eq!(results.len(), 2);
            assert_eq!(results[0].0, Bytes::from_static(b"k1"));
            assert_eq!(results[0].1, Bytes::from_static(b"v1"));
            assert_eq!(results[1].0, Bytes::from_static(b"k3"));
            assert_eq!(results[1].1, Bytes::from_static(b"v3"));
        } // iter is dropped here

        // Test 2: A new scan after the put should see k2
        {
            let mut iter2 = txn.scan(&b"k1"[..]..=&b"k3"[..]).await.unwrap();
            let mut results2 = Vec::new();
            while let Some(kv) = iter2.next().await.unwrap() {
                results2.push((kv.key.clone(), kv.value.clone()));
            }

            // This new scan should see all three keys and the updated value for k3
            assert_eq!(results2.len(), 3);
            assert_eq!(results2[0].0, Bytes::from_static(b"k1"));
            assert_eq!(results2[1].0, Bytes::from_static(b"k2"));
            assert_eq!(results2[1].1, Bytes::from_static(b"v2"));
            assert_eq!(results2[2].0, Bytes::from_static(b"k3"));
            assert_eq!(results2[2].1, Bytes::from_static(b"v3_updated"));
        } // iter2 is dropped here

        // Commit the transaction
        txn.commit().await.unwrap();

        // Verify k2 is now in the database
        let value = db.get(b"k2").await.unwrap();
        assert_eq!(value, Some(Bytes::from_static(b"v2")));
    }

    /// Under `SerializableSnapshot`, `mark_read()` and `get()` must lead to the
    /// same outcome at commit time when the key is overwritten by someone else:
    /// both transactions fail to commit.
    #[tokio::test]
    async fn test_mark_read_equivalent_to_get_in_ssi() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_mark_read_equivalent", store)
            .await
            .unwrap();

        db.put(b"k1", b"v1").await.unwrap();

        // Scenario A: the key is only marked as read, never actually fetched.
        let marking_txn = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();
        marking_txn.mark_read([b"k1"]).unwrap();

        // A write outside the transaction touches the marked key.
        db.put(b"k1", b"v2").await.unwrap();

        marking_txn.put(b"k2", b"v2").unwrap();
        assert!(
            marking_txn.commit().await.is_err(),
            "Transaction with mark_read() should conflict"
        );

        // Put k1 back to its starting value before the second scenario.
        db.put(b"k1", b"v1").await.unwrap();

        // Scenario B: same sequence, but the key is read with get().
        let reading_txn = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();
        let _ = reading_txn.get(b"k1").await.unwrap();

        db.put(b"k1", b"v2_again").await.unwrap();

        reading_txn.put(b"k3", b"v3").unwrap();
        assert!(
            reading_txn.commit().await.is_err(),
            "Transaction with get() should conflict"
        );
    }

    /// A single `mark_read()` call can register several keys; modifying any one
    /// of them outside the transaction makes the commit fail.
    #[tokio::test]
    async fn test_mark_read_multiple_keys_at_once() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_mark_read_multiple", store)
            .await
            .unwrap();

        for (key, value) in [(b"k1", b"v1"), (b"k2", b"v2"), (b"k3", b"v3")] {
            db.put(key, value).await.unwrap();
        }

        let txn = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();

        // All three keys are registered in one call.
        txn.mark_read([b"k1", b"k2", b"k3"]).unwrap();

        // Only the middle key is changed behind the transaction's back.
        db.put(b"k2", b"v2_modified").await.unwrap();

        // The transaction's own write targets an unrelated key.
        txn.put(b"k4", b"v4").unwrap();
        let commit_outcome = txn.commit().await;

        assert!(
            commit_outcome.is_err(),
            "Transaction should conflict because k2 was modified"
        );
    }

    /// After `unmark_write()` on a key, a concurrent write to that key outside
    /// the transaction does not fail the commit, and the transaction's value is
    /// the one read back afterwards.
    #[tokio::test]
    async fn test_unmark_write_ignores_write_write_conflicts() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_unmark_write_ww", store)
            .await
            .unwrap();

        db.put(b"k1", b"v1").await.unwrap();

        // Write k1 inside the transaction, then drop it from write tracking.
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn.put(b"k1", b"v2").unwrap();
        txn.unmark_write([b"k1"]).unwrap();

        // Competing write to the same key, issued before the commit.
        db.put(b"k1", b"v3").await.unwrap();

        let commit_outcome = txn.commit().await;
        assert!(
            commit_outcome.is_ok(),
            "Transaction should not conflict for untracked write key"
        );

        // The transaction committed last, so its value is what is read back.
        let stored = db.get(b"k1").await.unwrap();
        assert_eq!(stored, Some(Bytes::from_static(b"v2")));
    }

    /// `unmark_write()` affects only the keys passed to it: a concurrent write
    /// to another key the transaction wrote still fails the commit.
    #[tokio::test]
    async fn test_unmark_write_only_excludes_selected_keys() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_unmark_write_partial", store)
            .await
            .unwrap();

        for (key, value) in [(b"k1", b"v1"), (b"k2", b"v2")] {
            db.put(key, value).await.unwrap();
        }

        // Both keys are written in the transaction; only k1 is unmarked.
        let txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        txn.put(b"k1", b"v1_txn").unwrap();
        txn.put(b"k2", b"v2_txn").unwrap();
        txn.unmark_write([b"k1"]).unwrap();

        // The competing write hits k2, which is still tracked.
        db.put(b"k2", b"v2_db").await.unwrap();

        let commit_outcome = txn.commit().await;
        assert!(
            commit_outcome.is_err(),
            "Transaction should still conflict on tracked key k2"
        );
    }

    /// A write that was unmarked by its own transaction must not cause a
    /// conflict for a different `SerializableSnapshot` transaction that read
    /// the same key.
    #[tokio::test]
    async fn test_unmark_write_avoids_read_write_conflicts_for_others() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_unmark_write_rw", store)
            .await
            .unwrap();

        db.put(b"k1", b"v1").await.unwrap();

        // The reader observes k1 under SSI and stays open.
        let reader = db
            .begin(IsolationLevel::SerializableSnapshot)
            .await
            .unwrap();
        let _ = reader.get(b"k1").await.unwrap();

        // The writer overwrites k1, unmarks that write, and commits while the
        // reader is still open.
        let writer = db.begin(IsolationLevel::Snapshot).await.unwrap();
        writer.put(b"k1", b"v2").unwrap();
        writer.unmark_write([b"k1"]).unwrap();
        writer.commit().await.unwrap();

        // The reader now writes an unrelated key and attempts to commit.
        reader.put(b"k2", b"v2").unwrap();
        let commit_outcome = reader.commit().await;
        assert!(
            commit_outcome.is_ok(),
            "Reader transaction should not conflict with untracked write key"
        );
    }

    /// Many concurrent SSI transactions each merge `+1` into the same counter
    /// key and unmark that write. Every commit must succeed and the final
    /// counter must equal the total number of transactions run.
    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
    async fn test_unmark_write_merge_counter_aggregates_under_high_concurrency() {
        const CONCURRENT_TXNS: usize = 32;
        const ROUNDS: usize = 20;
        const MERGE_INCREMENT: [u8; 8] = 1u64.to_le_bytes();
        const EXPECTED: u64 = (CONCURRENT_TXNS * ROUNDS) as u64;

        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::builder("test_unmark_write_merge_counter", store)
            .with_merge_operator(Arc::new(CounterMergeOperator))
            .build()
            .await
            .unwrap();

        for _round in 0..ROUNDS {
            // The barrier releases all tasks of a round together so their
            // transactions overlap.
            let start_gate = Arc::new(tokio::sync::Barrier::new(CONCURRENT_TXNS));

            let tasks: Vec<_> = (0..CONCURRENT_TXNS)
                .map(|_| {
                    let task_db = db.clone();
                    let task_gate = start_gate.clone();
                    tokio::spawn(async move {
                        task_gate.wait().await;
                        let txn = task_db
                            .begin(IsolationLevel::SerializableSnapshot)
                            .await
                            .unwrap();
                        txn.merge(b"counter", MERGE_INCREMENT).unwrap();
                        txn.unmark_write([b"counter"]).unwrap();
                        txn.commit().await.unwrap();
                    })
                })
                .collect();

            // Finish the whole round (propagating any task panic) before
            // starting the next one.
            for task in tasks {
                task.await.unwrap();
            }
        }

        let raw = db.get(b"counter").await.unwrap().unwrap();
        let total = u64::from_le_bytes(raw.as_ref().try_into().unwrap());
        assert_eq!(total, EXPECTED);
    }

    fn test_db_options(
        min_filter_keys: u32,
        l0_sst_size_bytes: usize,
        compactor_options: Option<crate::config::CompactorOptions>,
    ) -> crate::config::Settings {
        crate::config::Settings {
            flush_interval: None,
            #[cfg(feature = "wal_disable")]
            wal_enabled: true,
            manifest_poll_interval: std::time::Duration::from_secs(3600),
            manifest_update_timeout: std::time::Duration::from_secs(300),
            max_unflushed_bytes: 134_217_728,
            l0_max_ssts: 8,
            l0_flush_parallelism: 1,
            min_filter_keys,
            filter_bits_per_key: 10,
            l0_sst_size_bytes,
            compactor_options,
            compression_codec: None,
            object_store_cache_options: crate::config::ObjectStoreCacheOptions::default(),
            garbage_collector_options: None,
            default_ttl: None,
            block_format: None,
        }
    }

    /// Commits three non-empty transactions (a put, a put with a TTL, and a delete) and
    /// checks that each commit returns a write handle whose sequence number increases by
    /// one and whose creation timestamp equals the mock clock value set just before the
    /// transaction was begun.
    #[tokio::test]
    async fn test_txn_commit_returns_write_handle() {
        use slatedb_common::clock::MockSystemClock;

        let mock_clock = Arc::new(MockSystemClock::new());
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::builder("/tmp/test_txn_commit_returns_write_handle", store)
            .with_settings(test_db_options(0, 1024, None))
            .with_system_clock(mock_clock.clone())
            .build()
            .await
            .unwrap();

        // Every commit in this test uses the same write options.
        let no_durability_wait = WriteOptions {
            await_durable: false,
        };

        // Step 1: plain put at clock = 100.
        mock_clock.set(100);
        let put_txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        put_txn.put(b"key1", b"value1").unwrap();
        let put_handle = put_txn
            .commit_with_options(&no_durability_wait)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(put_handle.seqnum(), 1);
        assert_eq!(put_handle.create_ts(), 100);

        // Step 2: put carrying a TTL at clock = 200.
        mock_clock.set(200);
        let ttl_txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        let ttl_opts = PutOptions {
            ttl: crate::config::Ttl::ExpireAfter(1000),
        };
        ttl_txn
            .put_with_options(b"key2", b"value2", &ttl_opts)
            .unwrap();
        let ttl_handle = ttl_txn
            .commit_with_options(&no_durability_wait)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(ttl_handle.seqnum(), 2);
        assert_eq!(ttl_handle.create_ts(), 200);

        // Step 3: delete of the first key at clock = 300.
        mock_clock.set(300);
        let delete_txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        delete_txn.delete(b"key1").unwrap();
        let delete_handle = delete_txn
            .commit_with_options(&no_durability_wait)
            .await
            .unwrap()
            .unwrap();
        assert_eq!(delete_handle.seqnum(), 3);
        assert_eq!(delete_handle.create_ts(), 300);
    }

    /// Commits a transaction that performed no writes and checks that
    /// `commit_with_options` succeeds with `None` instead of a write handle.
    #[tokio::test]
    async fn test_txn_commit_with_options_empty_batch_returns_none() {
        let store: Arc<dyn object_store::ObjectStore> = Arc::new(InMemory::new());
        let db = crate::Db::open("test_txn_commit_with_options_empty_batch", store)
            .await
            .unwrap();
        let no_durability_wait = WriteOptions {
            await_durable: false,
        };

        // Nothing is written between `begin` and the commit.
        let empty_txn = db.begin(IsolationLevel::Snapshot).await.unwrap();
        let commit_outcome = empty_txn
            .commit_with_options(&no_durability_wait)
            .await
            .unwrap();

        assert!(commit_outcome.is_none());
    }
}