cqlite-core 0.11.0

//! Data.db writer - writes partition and row data
//!
//! Generates the Data.db component with V5CompressedLegacy (NB) format.
//! Maintains partition ordering by Murmur3 token and clustering ordering.
//! Tracks file positions for Index.db generation.
//!
//! Critical requirements:
//! - Partition ordering: By Murmur3 token, then key bytes (enforced by caller)
//! - Clustering ordering: By clustering comparator within partition (enforced by caller)
//! - Row size measurement: After VInt length bytes (Issue #237)
//! - Delta encoding: Uses Statistics.db baseline for timestamps/TTL/deletion times
//!
//! # V5CompressedLegacy Row Format
//!
//! Each row is encoded as:
//! ```text
//! [row_flags: u8]
//! [extended_flags: u8 if ROW_HAS_EXTENDED_FLAGS set]
//! [clustering_prefix: variable if present]
//! [row_size: VInt]                       ← Measured from AFTER this VInt
//! [prev_size: VInt]
//! [timestamp: VInt if ROW_HAS_TIMESTAMP]   ← Delta from min_timestamp
//! [ttl: VInt if ROW_HAS_TTL]              ← Delta from min_ttl
//! [deletion: 2 VInts if ROW_HAS_DELETION] ← local_deletion_time delta + deletion timestamp
//! [column_bitmap: VUInt bitmask of missing columns if NOT ROW_HAS_ALL_COLUMNS]
//! [cell_data...]
//! ```
//!
//! ## Row Flags
//! - `0x04` (HAS_TIMESTAMP): Timestamp delta present
//! - `0x08` (HAS_TTL): TTL delta present
//! - `0x10` (HAS_DELETION): Deletion time present (two VInts)
//! - `0x20` (HAS_ALL_COLUMNS): All columns present (no bitmap)
//! - `0x40` (HAS_COMPLEX_DELETION): Row contains complex column with deletion
//! - `0x80` (HAS_EXTENDED_FLAGS): Extended flags byte follows
//!
//! ## Cell Format
//! ```text
//! [flags: u8]
//! [timestamp: VInt if NOT USE_ROW_TIMESTAMP]  ← Delta from min_timestamp
//! [local_deletion_time: VUInt if deleted/expiring and NOT USE_ROW_TTL]
//! [ttl: VUInt if expiring and NOT USE_ROW_TTL]
//! [value_length: VInt]
//! [value_bytes]
//! ```
//!
//! ## Cell Flags
//! - `0x01` (IS_DELETED): Cell is a tombstone
//! - `0x02` (IS_EXPIRING): TTL fields follow
//! - `0x04` (HAS_EMPTY_VALUE): Zero-length value
//! - `0x08` (USE_ROW_TIMESTAMP): Use row-level timestamp (no timestamp delta)
//! - `0x10` (USE_ROW_TTL): Use row-level TTL (no TTL delta)
//!
//! References:
//! - Cassandra 5.0: `org.apache.cassandra.db.rows.UnfilteredSerializer`
//! - Parser: `cqlite-core/src/storage/sstable/reader/parsing/v5_compressed_legacy.rs`
//! - Format docs: `docs/sstables-definitive-guide/chapters/05-data-db-format.md`

use crate::error::{Error, Result};
use crate::schema::{Column, CqlType, TableSchema};
use crate::storage::serialization::types::TypeSerializer;
use crate::storage::serialization::vint::{encode_signed, encode_unsigned, unsigned_len};
use crate::storage::sstable::writer::stats_writer::StatisticsMetadata;
use crate::storage::write_engine::mutation::{
    ClusteringBound, ClusteringKey, DecoratedKey, Mutation, PartitionKey, PartitionTombstone,
    RangeTombstone, TableId,
};
use crate::types::{ComparatorType, UdtTypeDef, Value};
use std::io::Write;
use std::path::PathBuf;

// Row header flag constants (from V5CompressedLegacy parser)
/// Row carries a timestamp delta.
const ROW_HAS_TIMESTAMP: u8 = 0x04;
/// Row carries a TTL delta.
const ROW_HAS_TTL: u8 = 0x08;
/// Row carries a deletion time (two VInts).
// NOTE(review): the lint allowance suggests the writer does not emit this flag
// yet; it appears to be kept so the flag table stays complete — confirm before
// removing either the constant or the attribute.
#[allow(dead_code)]
const ROW_HAS_DELETION: u8 = 0x10;
/// Every column is present, so no column bitmap follows.
const ROW_HAS_ALL_COLUMNS: u8 = 0x20;
/// Row contains a complex column with a deletion.
const ROW_HAS_COMPLEX_DELETION: u8 = 0x40;
/// An extended-flags byte follows the row flags byte.
const ROW_HAS_EXTENDED_FLAGS: u8 = 0x80;

// Extended flag constants (when ROW_HAS_EXTENDED_FLAGS is set)
/// Marks the row as the partition's static row.
const EXTENDED_IS_STATIC: u8 = 0x01;

// Cell flag constants (from V5CompressedLegacy parser)
/// Cell is a tombstone.
const CELL_IS_DELETED: u8 = 0x01;
/// Cell is expiring; TTL fields follow.
const CELL_IS_EXPIRING: u8 = 0x02;
/// Cell has a zero-length value.
const CELL_HAS_EMPTY_VALUE: u8 = 0x04;
/// Cell reuses the row-level timestamp (no per-cell timestamp delta).
const CELL_USE_ROW_TIMESTAMP: u8 = 0x08;
/// Cell reuses the row-level TTL (no per-cell TTL delta).
// NOTE(review): as with ROW_HAS_DELETION, the lint allowance suggests this flag
// is not emitted yet — confirm before removing.
#[allow(dead_code)]
const CELL_USE_ROW_TTL: u8 = 0x10;

// Range tombstone marker constants
// NOTE(review): presumably the flags byte that introduces a range tombstone
// marker unfiltered; its use is outside this part of the file — confirm.
const IS_MARKER: u8 = 0x02;

// Range tombstone bound kinds.
//
// These are the ordinals of Cassandra's `ClusteringPrefix.Kind` enum
// (ClusteringPrefix.java) — the byte written on disk by
// `ClusteringBoundOrBoundary.Serializer.serialize()`:
//   0 = EXCL_END_BOUND, 1 = INCL_START_BOUND,
//   2 = EXCL_END_INCL_START_BOUNDARY, 3 = STATIC_CLUSTERING,
//   4 = CLUSTERING, 5 = INCL_END_EXCL_START_BOUNDARY,
//   6 = INCL_END_BOUND, 7 = EXCL_START_BOUND.
// (Issue #717: the writer previously used a private 0..5 numbering that no
// Cassandra reader understands.)
// Only the four plain bound kinds are defined here; the boundary kinds (2, 5)
// and the clustering kinds (3, 4) have no constant in this list.
const EXCL_END_BOUND: u8 = 0;
const INCL_START_BOUND: u8 = 1;
const INCL_END_BOUND: u8 = 6;
const EXCL_START_BOUND: u8 = 7;

// Partition/row markers
/// Single byte appended after the last unfiltered of every partition.
const END_OF_PARTITION: u8 = 0x01;

/// Capacity of the streaming Data.db `BufWriter` (Issue #492).
///
/// Large enough that each flushed partition coalesces into a handful of big
/// `write()` syscalls instead of many small default-8 KB ones, preserving the
/// throughput of the previous single whole-file write while keeping resident
/// memory bounded (this buffer plus one partition's scratch).
const DATA_SINK_BUFFER_BYTES: usize = 1024 * 1024;

/// Data.db component writer
///
/// Writes partitions and rows in V5CompressedLegacy format with delta encoding.
/// Caller must provide partitions in token order and rows in clustering order.
///
/// # Memory model (Issue #492)
///
/// The writer supports two modes that produce **byte-identical** Data.db output:
///
/// * **In-memory mode** (`DataWriter::new`): every partition is appended to the
///   `buffer` scratch and never flushed, so `finish()` returns the full Data.db
///   bytes. Used by unit tests that inspect the produced bytes directly.
///
/// * **Streaming mode** (`DataWriter::with_sink`): each partition is built in the
///   `buffer` scratch, written to a `BufWriter<File>` over the Data.db path, and
///   the scratch is cleared. Peak heap is therefore `O(largest partition)` rather
///   than `O(file)`, keeping a multi-GB compaction within the 128 MB target.
///
/// The mode is determined solely by whether `data_path` is set: `flush_partition`,
/// `finish` and `finish_streaming` all branch on it.
///
/// In both modes the file offset of a partition is `position + buffer.len()`
/// measured before any bytes are written. In streaming mode `buffer` is empty at
/// that point (just cleared) so the offset is `position`; in memory mode
/// `position` is always 0 and `buffer` holds all prior partitions, so the offset
/// equals the legacy `buffer.len()`. The within-partition size math uses relative
/// deltas into `buffer`, which are identical regardless of mode.
#[derive(Debug)]
pub struct DataWriter {
    /// Per-partition scratch buffer for Data.db content.
    ///
    /// In streaming mode this is written to `sink` and then cleared at the end
    /// of every `write_partition` (by `flush_partition`), so only one partition
    /// is resident. In memory mode it accumulates the entire Data.db output.
    buffer: Vec<u8>,
    /// Streaming sink over the Data.db path (streaming mode only).
    ///
    /// Lazily opened by `ensure_sink` on the first partition flush so that the
    /// keyspace/table directory exists before the first byte is written. `None`
    /// in in-memory mode, and `None` in streaming mode until that first flush.
    sink: Option<std::io::BufWriter<std::fs::File>>,
    /// Data.db output path (streaming mode only); used for lazy sink open.
    ///
    /// `Some` marks the writer as streaming; `None` marks it as in-memory.
    data_path: Option<PathBuf>,
    /// Bytes already flushed to `sink`. Always 0 in in-memory mode.
    ///
    /// Advanced by `flush_partition` after each sink write and returned by
    /// `finish_streaming` as the total Data.db size.
    position: u64,
    /// Statistics metadata for delta encoding
    ///
    /// Replaced wholesale by `update_stats`.
    stats: StatisticsMetadata,
}

impl DataWriter {
    /// Create a new in-memory Data.db writer.
    ///
    /// All partitions accumulate in `buffer`; `finish()` returns the full bytes.
    /// Prefer [`DataWriter::with_sink`] for production writes to bound memory.
    ///
    /// # Arguments
    /// * `stats` - Statistics metadata for delta encoding baselines
    pub fn new(stats: StatisticsMetadata) -> Self {
        Self {
            buffer: Vec::new(),
            sink: None,
            data_path: None,
            position: 0,
            stats,
        }
    }

    /// Create a streaming Data.db writer that flushes each partition to `data_path`.
    ///
    /// The file is opened lazily on the first `write_partition` (creating the
    /// parent directory if needed) so the keyspace/table layout is established
    /// before any bytes are written. Memory is bounded to the largest single
    /// partition.
    ///
    /// # Arguments
    /// * `stats` - Statistics metadata for delta encoding baselines
    /// * `data_path` - Destination path for the Data.db component
    pub fn with_sink(stats: StatisticsMetadata, data_path: PathBuf) -> Self {
        Self {
            buffer: Vec::new(),
            sink: None,
            data_path: Some(data_path),
            position: 0,
            stats,
        }
    }

    /// Open the streaming sink on first use, creating the parent directory.
    ///
    /// Returns immediately when the sink is already open or when the writer has
    /// no output path (in-memory mode).
    fn ensure_sink(&mut self) -> Result<()> {
        // Only a streaming writer whose sink is still closed has work to do.
        let target = match (&self.sink, self.data_path.as_deref()) {
            (None, Some(target)) => target,
            _ => return Ok(()),
        };

        if let Some(dir) = target.parent() {
            std::fs::create_dir_all(dir)?;
        }
        let file = std::fs::File::create(target)?;

        // A large BufWriter lets one partition's bytes coalesce into a few big
        // write() syscalls rather than many 8 KB-default ones, matching the
        // throughput of the old single whole-file write.
        let buffered = std::io::BufWriter::with_capacity(DATA_SINK_BUFFER_BYTES, file);
        self.sink = Some(buffered);
        Ok(())
    }

    /// Move the scratch buffer's bytes to the sink (streaming mode only).
    ///
    /// When an output path is configured the sink is opened if needed, the
    /// scratch is written to it, `position` grows by the number of bytes
    /// written, and the scratch is emptied so only one partition is resident.
    /// Without an output path (in-memory mode) nothing happens and the scratch
    /// keeps accumulating.
    fn flush_partition(&mut self) -> Result<()> {
        if self.data_path.is_some() {
            self.ensure_sink()?;

            let pending = self.buffer.len() as u64;
            if let Some(out) = self.sink.as_mut() {
                out.write_all(&self.buffer)?;
            }
            self.buffer.clear();
            self.position += pending;
        }
        Ok(())
    }

    /// Replace the statistics metadata used as delta-encoding baselines.
    ///
    /// Call this once the stats have been computed from all mutations and
    /// before any partition data is written: timestamps, TTLs and local
    /// deletion times are encoded relative to these values.
    pub fn update_stats(&mut self, stats: StatisticsMetadata) {
        // The previous baseline is discarded; nothing is merged.
        drop(std::mem::replace(&mut self.stats, stats));
    }

    /// Write a complete partition (partition key + all rows)
    ///
    /// The partition is first encoded into the scratch buffer and then, in
    /// streaming mode, flushed to the sink.
    ///
    /// # Arguments
    /// * `key` - Decorated partition key (token + raw bytes)
    /// * `mutations` - All mutations for this partition (must be in clustering order)
    /// * `schema` - Table schema for column metadata
    /// * `partition_tombstone` - Optional partition-level tombstone
    /// * `range_tombstones` - Range tombstones for this partition (must be in clustering order)
    ///
    /// # Returns
    /// File offset where this partition starts (for Index.db)
    ///
    /// # Errors
    /// If encoding the partition fails, the scratch buffer is rolled back to
    /// its length on entry, so a failed partition never leaves partial bytes
    /// that would be returned by `finish()` or flushed ahead of a later
    /// partition. Errors from flushing to the sink are propagated unchanged;
    /// in that case the encoded partition is left in the scratch buffer, as
    /// the sink may already have accepted part of it.
    pub fn write_partition(
        &mut self,
        key: &DecoratedKey,
        mutations: &[Mutation],
        schema: &TableSchema,
        partition_tombstone: Option<&PartitionTombstone>,
        range_tombstones: &[RangeTombstone],
    ) -> Result<u64> {
        // File offset of this partition = bytes already flushed (`position`) plus
        // whatever is currently buffered. In streaming mode `buffer` is empty at
        // the start of each partition (flushed + cleared by the previous call),
        // so this is `position`; in in-memory mode `position` is 0 and `buffer`
        // holds all prior partitions, matching the legacy `buffer.len()`.
        let scratch_start = self.buffer.len();
        let partition_offset = self.position + scratch_start as u64;

        if let Err(err) = self.encode_partition_into_scratch(
            key,
            mutations,
            schema,
            partition_tombstone,
            range_tombstones,
        ) {
            // Discard the half-written partition so the offset invariant above
            // still holds for whatever the caller writes next.
            self.buffer.truncate(scratch_start);
            return Err(err);
        }

        // Streaming mode: flush this partition to disk and clear the scratch so
        // only one partition is ever resident in memory. No-op in memory mode.
        self.flush_partition()?;

        Ok(partition_offset)
    }

    /// Append one encoded partition (header, static row, rows and range
    /// tombstone markers, end-of-partition byte) to the scratch buffer.
    ///
    /// Does not flush and does not clean up after itself on error; that is
    /// the responsibility of `write_partition`.
    fn encode_partition_into_scratch(
        &mut self,
        key: &DecoratedKey,
        mutations: &[Mutation],
        schema: &TableSchema,
        partition_tombstone: Option<&PartitionTombstone>,
        range_tombstones: &[RangeTombstone],
    ) -> Result<()> {
        // Write partition header (with optional tombstone)
        let header_start = self.buffer.len();
        self.write_partition_header(key, partition_tombstone)?;
        let mut prev_unfiltered_size = (self.buffer.len() - header_start) as u64;

        // SSTables must be internally reconciled: Cassandra's read path and
        // compaction only reconcile rows against deletions from OTHER sources
        // (memtables / other sstables) — a row shadowed by a partition or
        // range tombstone in the SAME sstable is served live. Cassandra's own
        // flush drops shadowed data, and so must we (Issue #716/#717).
        // `partition_floor` is the shadow timestamp from the partition
        // tombstone; per-row floors additionally account for covering range
        // tombstones.
        let partition_floor = partition_tombstone.map(|pt| pt.deletion_time);

        // Cassandra's SerializationHeader.hasStatic() returns true whenever the schema
        // declares any static column — and both the writer and reader unconditionally
        // emit/consume a static-row prelude in that case.  We must do the same.
        let schema_has_static = schema.columns.iter().any(|c| c.is_static);

        if schema_has_static {
            // Collect static-column operations from ALL mutations in this partition,
            // regardless of whether the mutation also carries a clustering key.
            // Last-write-wins by timestamp_micros when the same column appears twice.
            // Static cells shadowed by the partition tombstone are dropped.
            let merged = collect_static_operations(mutations, schema, partition_floor);

            // Mutations shadowed by the partition tombstone cannot contribute
            // the static row's liveness timestamp or TTL either.
            let unshadowed_static = |m: &&Mutation| {
                partition_floor.is_none_or(|floor| m.timestamp_micros > floor)
                    && has_static_operation(m, schema)
            };

            if merged.is_empty() {
                // Schema declares statics but this partition writes none.
                // Cassandra still expects the prelude; emit the minimal empty form.
                prev_unfiltered_size =
                    self.write_empty_static_row(prev_unfiltered_size, schema)? as u64;
            } else {
                // Build a synthetic Mutation carrying the merged static ops.
                // Use the latest timestamp seen across contributing mutations.
                // `!merged.is_empty()` implies at least one mutation contributed
                // an unshadowed static op, so the inner `.max()` is guaranteed `Some`.
                let latest_ts = mutations
                    .iter()
                    .filter(unshadowed_static)
                    .map(|m| m.timestamp_micros)
                    .max()
                    .unwrap_or(mutations.first().map(|m| m.timestamp_micros).unwrap_or(0));

                // Pick the TTL from the mutation with the latest timestamp that
                // contributed a static op (mirrors Cassandra's last-write-wins).
                let ttl = mutations
                    .iter()
                    .filter(unshadowed_static)
                    .max_by_key(|m| m.timestamp_micros)
                    .and_then(|m| m.ttl_seconds);

                let synthetic = Mutation {
                    table: mutations
                        .first()
                        .map(|m| m.table.clone())
                        .unwrap_or_else(|| TableId {
                            keyspace: schema.keyspace.clone(),
                            table: schema.table.clone(),
                        }),
                    partition_key: mutations
                        .first()
                        .map(|m| m.partition_key.clone())
                        .unwrap_or_else(|| PartitionKey {
                            columns: Vec::new(),
                        }),
                    clustering_key: None,
                    operations: merged,
                    timestamp_micros: latest_ts,
                    ttl_seconds: ttl,
                    partition_tombstone: None,
                    range_tombstones: Vec::new(),
                };

                prev_unfiltered_size =
                    self.write_static_row_with_prev_size(&synthetic, schema, prev_unfiltered_size)?
                        as u64;
            }
        }

        // Merge all mutations sharing a clustering key into a single row each
        // (Issue #716/#717: writing one row per mutation produced duplicate
        // rows with equal clustering — e.g. an INSERT row plus a phantom
        // tombstone-carrier row — which is invalid in the OA format). Rows
        // shadowed by the partition tombstone or a covering range tombstone
        // are dropped during the merge.
        let rows = self.merge_clustering_rows(
            mutations,
            schema,
            schema_has_static,
            partition_floor,
            range_tombstones,
        );

        // Interleave merged rows with range tombstone bound markers in
        // clustering order. Cassandra requires every unfiltered (row or
        // marker) of a partition to appear in clustering order; with equal
        // clustering values, inclusive-start/exclusive-end bounds sort before
        // the row and inclusive-end/exclusive-start bounds sort after it
        // (ClusteringPrefix.Kind.comparedToClustering).
        enum PartitionItem<'a> {
            Row(RowWrite<'a>),
            Marker {
                bound: &'a ClusteringBound,
                is_open: bool,
                deletion_time: i64,
                local_deletion_time: i32,
            },
        }

        let mut items: Vec<PartitionItem> = rows.into_iter().map(PartitionItem::Row).collect();
        for rt in range_tombstones {
            items.push(PartitionItem::Marker {
                bound: &rt.start,
                is_open: true,
                deletion_time: rt.deletion_time,
                local_deletion_time: rt.local_deletion_time,
            });
            items.push(PartitionItem::Marker {
                bound: &rt.end,
                is_open: false,
                deletion_time: rt.deletion_time,
                local_deletion_time: rt.local_deletion_time,
            });
        }

        // Sort key: (partition position class, clustering values, bound weight).
        // class: -1 = before all rows (Bottom), 0 = positioned by clustering
        // values, 1 = after all rows (Top).
        fn sort_class<'a, 'b>(item: &'b PartitionItem<'a>) -> (i8, Option<&'b ClusteringKey>, i8) {
            match item {
                PartitionItem::Row(row) => (0, row.clustering_key, 0),
                PartitionItem::Marker { bound, is_open, .. } => match bound {
                    ClusteringBound::Inclusive(ck) => (0, Some(ck), if *is_open { -1 } else { 1 }),
                    ClusteringBound::Exclusive(ck) => (0, Some(ck), if *is_open { 1 } else { -1 }),
                    ClusteringBound::Bottom => (-1, None, 0),
                    ClusteringBound::Top => (1, None, 0),
                },
            }
        }
        // The sort is stable, so items that compare equal keep the order in
        // which they were pushed above (rows first, then each tombstone's
        // start before its end).
        items.sort_by(|a, b| {
            let (class_a, ck_a, weight_a) = sort_class(a);
            let (class_b, ck_b, weight_b) = sort_class(b);
            class_a
                .cmp(&class_b)
                .then_with(|| match (ck_a, ck_b) {
                    // Fall back to the derived ordering if the schema-aware
                    // comparison reports an error.
                    (Some(x), Some(y)) => x.compare(y, schema).unwrap_or_else(|_| x.cmp(y)),
                    _ => std::cmp::Ordering::Equal,
                })
                .then(weight_a.cmp(&weight_b))
        });

        for item in items {
            prev_unfiltered_size = match item {
                PartitionItem::Row(row) => {
                    self.write_merged_row_with_prev_size(&row, schema, prev_unfiltered_size)? as u64
                }
                PartitionItem::Marker {
                    bound,
                    is_open,
                    deletion_time,
                    local_deletion_time,
                } => self.write_range_bound(
                    bound,
                    is_open,
                    deletion_time,
                    local_deletion_time,
                    schema,
                    prev_unfiltered_size,
                )? as u64,
            };
        }

        // Write end-of-partition marker
        self.buffer.push(END_OF_PARTITION);

        Ok(())
    }

    /// Emit the static-row prelude for a partition that writes no static cells.
    ///
    /// Cassandra expects this prelude whenever the schema declares a static
    /// column, even if the partition itself has none to store.
    ///
    /// Binary form:
    /// ```text
    /// [0x80]              ← row_flags = ROW_HAS_EXTENDED_FLAGS only
    /// [0x01]              ← extended_flags = EXTENDED_IS_STATIC
    /// [row_size: VUInt]   ← size of (prev_size VInt + bitmap)
    /// [prev_size: VUInt]
    /// [bitmap: VUInt]     ← all-missing bitmap: (1 << N) - 1 for N static cols
    ///                       (encoded via write_column_subset with empty present set)
    /// ```
    ///
    /// Returns the number of bytes appended to the scratch buffer.
    fn write_empty_static_row(&mut self, prev_size: u64, schema: &TableSchema) -> Result<usize> {
        let row_start = self.buffer.len();

        // Flags: extended-flags only (no timestamp, TTL, deletion, all-columns
        // or complex-deletion bits), followed by the static marker.
        self.buffer.push(ROW_HAS_EXTENDED_FLAGS);
        self.buffer.push(EXTENDED_IS_STATIC);

        // The bitmap is encoded on the side first because the row size, which
        // precedes it on disk, has to include its length.
        let mut bitmap = Vec::new();
        let static_columns = self.static_columns(schema);
        let nothing_present = std::collections::HashSet::<&str>::new();
        self.write_column_subset(&mut bitmap, &static_columns, &nothing_present)?;

        // row_size counts everything after its own VInt: prev_size + bitmap.
        let row_size = unsigned_len(prev_size) as u64 + bitmap.len() as u64;
        encode_unsigned(row_size, &mut self.buffer);
        encode_unsigned(prev_size, &mut self.buffer);
        self.buffer.extend_from_slice(&bitmap);

        Ok(self.buffer.len() - row_start)
    }

    /// Finish writing and return the Data.db bytes (in-memory mode).
    ///
    /// Only valid for writers created via [`DataWriter::new`]. In streaming mode
    /// the bytes live on disk; use [`DataWriter::finish_streaming`] instead.
    pub fn finish(self) -> Result<Vec<u8>> {
        // Hard guard (not debug_assert!, which compiles out in release): on a
        // streaming writer the bytes live on disk and `buffer` is empty after each
        // partition flush, so returning it would silently yield a 0-byte Data.db.
        if self.data_path.is_some() {
            return Err(Error::InvalidInput(
                "DataWriter::finish() called on a streaming writer; use finish_streaming()"
                    .to_string(),
            ));
        }
        Ok(self.buffer)
    }

    /// Finish a streaming writer: flush the sink to disk and return the total
    /// number of Data.db bytes written (i.e. `data_size`).
    ///
    /// Any residual scratch (there is none in normal operation, since
    /// `write_partition` flushes per partition) is flushed first. Returns an
    /// error if the writer was created in in-memory mode.
    pub fn finish_streaming(mut self) -> Result<u64> {
        if self.data_path.is_none() {
            return Err(Error::InvalidInput(
                "finish_streaming() called on an in-memory DataWriter".to_string(),
            ));
        }
        // Flush any residual scratch (normally empty), then flush the sink so all
        // bytes reach the OS file (the subsequent Digest CRC re-read of the same
        // file sees them via the page cache). This matches the durability of the
        // previous `tokio::fs::write`, which did not fsync either.
        self.flush_partition()?;
        if let Some(mut sink) = self.sink.take() {
            sink.flush()?;
        }
        Ok(self.position)
    }

    /// Write partition header
    ///
    /// Format (V5CompressedLegacy / Cassandra BigFormat):
    /// ```text
    /// [key_length: u16 BE]           ← Partition key length (2-byte unsigned short)
    /// [key_bytes]                    ← Raw partition key bytes
    /// [local_deletion_time: i32 BE]  ← i32::MAX for LIVE (DeletionTime.LIVE)
    /// [deletion_timestamp: i64 BE]   ← i64::MIN for LIVE (DeletionTime.LIVE)
    /// ```
    ///
    /// Note: Cassandra uses `ByteBufferUtil.writeWithShortLength()` for the key,
    /// which is a 2-byte BE unsigned short. There is NO separate flags byte.
    /// DeletionTime.LIVE uses sentinel values (Integer.MAX_VALUE, Long.MIN_VALUE).
    fn write_partition_header(
        &mut self,
        key: &DecoratedKey,
        tombstone: Option<&PartitionTombstone>,
    ) -> Result<()> {
        // Partition key length (u16 BE, matching Cassandra's writeWithShortLength)
        if key.key.len() > 65535 {
            return Err(Error::InvalidInput(format!(
                "Partition key too large: {} bytes (max 65535)",
                key.key.len()
            )));
        }
        self.buffer
            .write_all(&(key.key.len() as u16).to_be_bytes())?;

        // Partition key bytes
        self.buffer.extend_from_slice(&key.key);

        // Partition deletion info
        if let Some(ts) = tombstone {
            // Local deletion time (i32 BE, in seconds)
            self.buffer
                .write_all(&ts.local_deletion_time.to_be_bytes())?;
            // Deletion timestamp (i64 BE, in microseconds)
            self.buffer.write_all(&ts.deletion_time.to_be_bytes())?;
        } else {
            // DeletionTime.LIVE: Cassandra uses (Integer.MAX_VALUE, Long.MIN_VALUE)
            self.buffer.write_all(&i32::MAX.to_be_bytes())?;
            self.buffer.write_all(&i64::MIN.to_be_bytes())?;
        }

        Ok(())
    }

    /// Write one mutation as a row with a previous-unfiltered size of 0.
    ///
    /// Uses the V5CompressedLegacy row format with delta encoding; the number
    /// of bytes written is discarded.
    #[allow(dead_code)]
    fn write_row(&mut self, mutation: &Mutation, schema: &TableSchema) -> Result<()> {
        self.write_row_with_prev_size(mutation, schema, 0)
            .map(|_written| ())
    }

    /// Write a single mutation as one row via the merged-row path.
    ///
    /// A thin adapter kept so legacy callers (and unit tests) keep working:
    /// the mutation is merged as a group of one, with static ops included and
    /// no shadow floor. Returns the number of bytes written, or 0 when the
    /// mutation merges to no row at all.
    fn write_row_with_prev_size(
        &mut self,
        mutation: &Mutation,
        schema: &TableSchema,
        prev_size: u64,
    ) -> Result<usize> {
        let Some(row) = Self::merge_row_group(&[mutation], schema, false, None) else {
            // Nothing to write (e.g. a tombstone-carrier mutation with no ops)
            return Ok(0);
        };
        self.write_merged_row_with_prev_size(&row, schema, prev_size)
    }

    /// Collapse a partition's mutations into one [`RowWrite`] per clustering key.
    ///
    /// Grouping is by adjacency, so the mutations must already be sorted by
    /// clustering key (the caller — `SSTableWriter::write_partition` — sorts
    /// them). Pure-static mutations are left out because their cells live in
    /// the static-row prelude. A group that merges to nothing (e.g. mutations
    /// that exist only to carry partition/range tombstones) yields no row.
    fn merge_clustering_rows<'a>(
        &self,
        mutations: &'a [Mutation],
        schema: &TableSchema,
        skip_static_ops: bool,
        partition_floor: Option<i64>,
        range_tombstones: &[RangeTombstone],
    ) -> Vec<RowWrite<'a>> {
        let row_mutations: Vec<&'a Mutation> = mutations
            .iter()
            .filter(|m| !is_static_row_mutation(m, schema))
            .collect();

        let rows: Vec<RowWrite<'a>> = row_mutations
            // Runs of adjacent mutations with equal clustering keys.
            .chunk_by(|left, right| left.clustering_key == right.clustering_key)
            .filter_map(|group| {
                // Shadow floor for this row: the partition tombstone combined
                // with every range tombstone covering the group's clustering
                // key; the newest deletion timestamp wins.
                let clustering_key = group[0].clustering_key.as_ref();
                let shadow_floor = range_tombstones
                    .iter()
                    .filter(|&rt| range_tombstone_covers(rt, clustering_key, schema))
                    .map(|rt| rt.deletion_time)
                    .fold(partition_floor, |floor, deleted_at| {
                        Some(floor.map_or(deleted_at, |f| f.max(deleted_at)))
                    });

                Self::merge_row_group(group, schema, skip_static_ops, shadow_floor)
            })
            .collect();
        rows
    }

    /// Merge a group of mutations sharing one clustering key into a single
    /// row, applying Cassandra reconciliation semantics at write time:
    ///
    /// - Row deletion: the newest `DeleteRow` wins; mutations at or before
    ///   the deletion timestamp are shadowed (`DeletionTime.deletes` uses
    ///   `timestamp <= markedForDeleteAt`).
    /// - Cells: last-write-wins per column by timestamp; a tombstone wins a
    ///   timestamp tie (Cassandra cell reconciliation).
    /// - Liveness: from the newest surviving mutation that writes cells, or
    ///   a pure primary-key insert (no ops and no tombstone payload). Pure
    ///   row tombstones carry NO liveness, matching Cassandra's serializer.
    ///
    /// Returns `None` when the group produces no row at all (e.g. a mutation
    /// that exists only to carry a partition or range tombstone, or a row
    /// fully shadowed by the partition/range tombstone `shadow_floor`).
    fn merge_row_group<'a>(
        group: &[&'a Mutation],
        schema: &TableSchema,
        skip_static_ops: bool,
        shadow_floor: Option<i64>,
    ) -> Option<RowWrite<'a>> {
        use crate::storage::write_engine::mutation::CellOperation;
        use std::collections::hash_map::Entry;
        use std::collections::HashMap;

        // Pass 1: pick the newest row deletion in the group. Deletions at or
        // below the shadow floor are already covered by the partition/range
        // tombstone and are ignored; on equal timestamps the later mutation
        // in the group replaces the earlier one.
        let mut row_deletion: Option<(i64, i32)> = None;
        for mutation in group {
            let ts = mutation.timestamp_micros;
            let deletes_row = mutation
                .operations
                .iter()
                .any(|op| matches!(op, CellOperation::DeleteRow));
            if !deletes_row {
                continue;
            }
            let above_floor = shadow_floor.is_none_or(|floor| ts > floor);
            let not_older = row_deletion.is_none_or(|(newest, _)| ts >= newest);
            if above_floor && not_older {
                // Local deletion time is the deletion timestamp in whole seconds.
                row_deletion = Some((ts, (ts / 1_000_000) as i32));
            }
        }

        // Effective shadowing timestamp: the stronger of the row deletion and
        // the partition/range tombstone floor.
        let deletion_ts = match (row_deletion, shadow_floor) {
            (Some((row_ts, _)), Some(floor)) => Some(row_ts.max(floor)),
            (Some((row_ts, _)), None) => Some(row_ts),
            (None, floor) => floor,
        };

        // Pass 2: per-column last-write-wins merge, plus row liveness taken
        // from the newest mutation that contributes a write (or is a pure
        // primary-key insert).
        let mut cells: HashMap<&'a str, MergedOp<'a>> = HashMap::new();
        let mut liveness: Option<(i64, Option<u32>)> = None;

        for m in group {
            let ts = m.timestamp_micros;

            // Anything at or before the covering deletion is shadowed.
            if deletion_ts.is_some_and(|dts| ts <= dts) {
                continue;
            }

            let mut contributes_liveness = false;
            for op in &m.operations {
                let (column, is_write) = match op {
                    CellOperation::Write { column, .. }
                    | CellOperation::WriteWithTtl { column, .. } => (column.as_str(), true),
                    CellOperation::Delete { column } => (column.as_str(), false),
                    CellOperation::DeleteRow => continue,
                };
                if skip_static_ops && is_static_operation(op, schema) {
                    continue;
                }
                contributes_liveness |= is_write;

                let candidate = MergedOp {
                    op,
                    timestamp_micros: ts,
                    row_ttl_seconds: m.ttl_seconds,
                };
                match cells.entry(column) {
                    Entry::Vacant(slot) => {
                        slot.insert(candidate);
                    }
                    Entry::Occupied(mut slot) => {
                        let current = slot.get();
                        let newer = candidate.timestamp_micros > current.timestamp_micros;
                        let tie = candidate.timestamp_micros == current.timestamp_micros;
                        let candidate_is_tombstone =
                            matches!(candidate.op, CellOperation::Delete { .. });
                        let current_is_tombstone =
                            matches!(current.op, CellOperation::Delete { .. });
                        // On a timestamp tie a tombstone beats a write, and a
                        // later op of the same kind replaces the earlier one.
                        if newer || (tie && (candidate_is_tombstone || !current_is_tombstone)) {
                            slot.insert(candidate);
                        }
                    }
                }
            }

            // A mutation with no ops and no tombstone payload is a pure
            // primary-key insert: it creates row liveness but no cells.
            let pure_pk_insert = m.operations.is_empty()
                && m.partition_tombstone.is_none()
                && m.range_tombstones.is_empty();
            let not_older = liveness.is_none_or(|(newest, _)| ts >= newest);
            if (contributes_liveness || pure_pk_insert) && not_older {
                liveness = Some((ts, m.ttl_seconds));
            }
        }

        let ops: Vec<MergedOp<'a>> = cells.into_values().collect();

        // Nothing survived: no cells, no row tombstone, no liveness.
        if ops.is_empty() && row_deletion.is_none() && liveness.is_none() {
            return None;
        }

        Some(RowWrite {
            clustering_key: group[0].clustering_key.as_ref(),
            liveness_ts: liveness.map(|(ts, _)| ts),
            ttl_seconds: liveness.and_then(|(_, ttl)| ttl),
            row_deletion,
            ops,
        })
    }

    /// Write one merged row (flags + clustering prefix + sizes + body).
    ///
    /// Layout appended to the buffer:
    /// `[flags][clustering prefix, if any][row_size][prev_size][row body]`,
    /// where `row_size` counts the `prev_size` VInt plus the row body.
    ///
    /// # Arguments
    /// * `row` - Merged row (liveness, optional row deletion, surviving ops)
    /// * `schema` - Table schema used for column classification
    /// * `prev_size` - Value written as the prev_unfiltered_size VInt
    ///
    /// # Returns
    /// The number of bytes appended to the buffer.
    ///
    /// # Errors
    /// Propagates errors from the clustering prefix and row body builders.
    /// On error the buffer is truncated back to its length on entry, so a
    /// failed row never leaves a dangling flags byte or partial prefix behind.
    fn write_merged_row_with_prev_size(
        &mut self,
        row: &RowWrite<'_>,
        schema: &TableSchema,
        prev_size: u64,
    ) -> Result<usize> {
        use crate::storage::write_engine::mutation::CellOperation;

        let start_len = self.buffer.len();

        // Build row header flags
        let mut flags = 0u8;

        if row.row_deletion.is_some() {
            flags |= ROW_HAS_DELETION; // 0x10
        }
        if row.liveness_ts.is_some() {
            flags |= ROW_HAS_TIMESTAMP;
            // TTL is only meaningful alongside a liveness timestamp.
            if row.ttl_seconds.is_some() {
                flags |= ROW_HAS_TTL;
            }
        }

        // All columns present if there is no deletion, all surviving ops are
        // non-NULL writes, and they cover every regular column.
        if row.row_deletion.is_none() {
            let all_live_writes = row.ops.iter().all(|mop| {
                matches!(
                    mop.op,
                    CellOperation::Write { value, .. } | CellOperation::WriteWithTtl { value, .. }
                        if !matches!(value, Value::Null)
                )
            });
            if all_live_writes && row.ops.len() == self.regular_columns(schema).len() {
                flags |= ROW_HAS_ALL_COLUMNS;
            }
        }

        // The complex-deletion flag is raised whenever any surviving op
        // targets a complex column (non-frozen collection).
        let has_complex = row.ops.iter().any(|mop| {
            let name = match mop.op {
                CellOperation::Write { column, .. }
                | CellOperation::WriteWithTtl { column, .. }
                | CellOperation::Delete { column } => column.as_str(),
                CellOperation::DeleteRow => return false,
            };
            schema
                .columns
                .iter()
                .find(|c| c.name == name)
                .is_some_and(|c| is_complex_column(&c.data_type))
        });
        if has_complex {
            flags |= ROW_HAS_COMPLEX_DELETION;
        }

        // Write row flags
        self.buffer.push(flags);

        // Write clustering prefix if present (before row_size). Roll the
        // buffer back on failure so no partial row is left behind.
        if let Some(clustering_key) = row.clustering_key {
            if let Err(err) = self.write_clustering_prefix(clustering_key, schema) {
                self.buffer.truncate(start_len);
                return Err(err);
            }
        }

        // Build the row body (everything after the prev_size VInt).
        let row_body = match self.build_merged_row_body(row, schema, flags) {
            Ok(body) => body,
            Err(err) => {
                self.buffer.truncate(start_len);
                return Err(err);
            }
        };

        // Write row_size (VInt) — Cassandra's serializedRowBodySize() includes
        // the prev_unfiltered_size VInt as part of the row body
        let row_body_size = unsigned_len(prev_size) as u64 + row_body.len() as u64;
        encode_unsigned(row_body_size, &mut self.buffer);

        // Write prev_unfiltered_size (VInt, inside the row body)
        encode_unsigned(prev_size, &mut self.buffer);

        // Write rest of row body
        self.buffer.extend_from_slice(&row_body);

        Ok(self.buffer.len() - start_len)
    }

    /// Write the static row of the current partition.
    ///
    /// A static row carries the STATIC column values that live at partition
    /// level. It is always written with extended flags and never has a
    /// clustering prefix. This entry point writes `0` as the previous
    /// unfiltered size.
    ///
    /// # Arguments
    /// * `mutation` - Mutation containing static column values
    /// * `schema` - Table schema for column metadata
    ///
    /// # Errors
    /// Propagates any error raised while building the static row body.
    ///
    /// # Binary Format
    /// ```text
    /// [row_flags: u8]        ← 0x80 | other_flags (always HAS_EXTENDED_FLAGS)
    /// [extended_flags: u8]   ← 0x01 (EXTENDED_IS_STATIC)
    /// [row_size: VInt]       ← Size of body after this
    /// [prev_size: VInt]      ← 0 or previous row size
    /// [timestamp: VInt]      ← If HAS_TIMESTAMP (delta)
    /// [ttl: VInt]            ← If HAS_TTL (delta)
    /// [deletion: 2 VInts]    ← If HAS_DELETION
    /// [column_bitmap]        ← If NOT HAS_ALL_COLUMNS
    /// [cell_data...]         ← Static column cells only
    /// ```
    pub fn write_static_row(&mut self, mutation: &Mutation, schema: &TableSchema) -> Result<()> {
        // The byte count returned by the inner writer is not needed here.
        self.write_static_row_with_prev_size(mutation, schema, 0)
            .map(|_written| ())
    }

    /// Write a static row with an explicit prev_unfiltered_size.
    ///
    /// Layout appended to the buffer:
    /// `[flags][EXTENDED_IS_STATIC][row_size][prev_size][row body]`, where
    /// `row_size` counts the `prev_size` VInt plus the row body. Static rows
    /// have no clustering prefix.
    ///
    /// Returns the number of bytes appended to the buffer.
    ///
    /// # Errors
    /// Propagates errors from the static row body builder. The body is built
    /// before anything is appended, so a failure leaves the buffer untouched.
    fn write_static_row_with_prev_size(
        &mut self,
        mutation: &Mutation,
        schema: &TableSchema,
        prev_size: u64,
    ) -> Result<usize> {
        use crate::storage::write_engine::mutation::CellOperation;

        let start_len = self.buffer.len();

        // A DeleteRow operation turns the static row into a row tombstone.
        let is_row_tombstone = mutation
            .operations
            .iter()
            .any(|op| matches!(op, CellOperation::DeleteRow));

        // Static rows always carry extended flags and a timestamp.
        let mut flags = ROW_HAS_EXTENDED_FLAGS | ROW_HAS_TIMESTAMP;

        if is_row_tombstone {
            flags |= ROW_HAS_DELETION;
        } else {
            // TTL is not applicable to row tombstones.
            if mutation.ttl_seconds.is_some() {
                flags |= ROW_HAS_TTL;
            }

            // All static columns are present when every op is a non-NULL
            // write and the op count equals the number of static columns.
            let all_live_writes = mutation.operations.iter().all(|op| {
                matches!(
                    op,
                    CellOperation::Write { value, .. } | CellOperation::WriteWithTtl { value, .. }
                        if !matches!(value, Value::Null)
                )
            });
            let static_column_count = schema.columns.iter().filter(|c| c.is_static).count();
            if all_live_writes && mutation.operations.len() == static_column_count {
                flags |= ROW_HAS_ALL_COLUMNS;
            }
        }

        // Build the body first: it is the only fallible step, and doing it
        // before touching the buffer avoids leaving stray flag bytes on error.
        let row_body = self.build_static_row_body(mutation, schema, flags)?;

        // Row flags, then extended flags (always EXTENDED_IS_STATIC).
        self.buffer.push(flags);
        self.buffer.push(EXTENDED_IS_STATIC);

        // NO clustering prefix for static rows.

        // Write row_size (VInt) — includes prev_unfiltered_size VInt + rest of body
        let row_body_size = unsigned_len(prev_size) as u64 + row_body.len() as u64;
        encode_unsigned(row_body_size, &mut self.buffer);

        // Write prev_unfiltered_size (VInt, inside the row body)
        encode_unsigned(prev_size, &mut self.buffer);

        // Write rest of row body
        self.buffer.extend_from_slice(&row_body);

        Ok(self.buffer.len() - start_len)
    }

    /// Build static row body (everything after row_size VInt)
    ///
    /// Similar to build_row_body but only processes static columns.
    fn build_static_row_body(
        &self,
        mutation: &Mutation,
        schema: &TableSchema,
        flags: u8,
    ) -> Result<Vec<u8>> {
        let mut body = Vec::new();

        // Write timestamp delta (if HAS_TIMESTAMP)
        //
        // Fix #644 (S6): Cassandra writes UNSIGNED VInt for all temporal deltas.
        // SerializationHeader.java:167: out.writeUnsignedVInt(timestamp - stats.minTimestamp)
        if (flags & ROW_HAS_TIMESTAMP) != 0 {
            let timestamp_delta = (mutation.timestamp_micros - self.stats.min_timestamp) as u64;
            encode_unsigned(timestamp_delta, &mut body);
        }

        // Write TTL delta (if HAS_TTL)
        //
        // Fix #644 (S6): Both TTL and LDT deltas are UNSIGNED VInt.
        // SerializationHeader.java:177: out.writeUnsignedVInt32(ttl - stats.minTTL)
        // SerializationHeader.java:172: out.writeUnsignedVInt32(ldt - stats.minLocalDeletionTime)
        if (flags & ROW_HAS_TTL) != 0 {
            if let Some(ttl) = mutation.ttl_seconds {
                let ttl_delta = ttl as i64 - self.stats.min_ttl as i64;
                if ttl_delta < 0 {
                    return Err(Error::InvalidInput(format!(
                        "TTL {} is less than min_ttl {}",
                        ttl, self.stats.min_ttl
                    )));
                }
                encode_unsigned(ttl_delta as u64, &mut body);

                let local_deletion_time = self.expiring_local_deletion_time(ttl)?;
                let ldt_delta =
                    (local_deletion_time as i64) - (self.stats.min_local_deletion_time as i64);
                if ldt_delta < 0 {
                    return Err(Error::InvalidInput(format!(
                        "Local deletion time {} is less than min_local_deletion_time {}",
                        local_deletion_time, self.stats.min_local_deletion_time
                    )));
                }
                encode_unsigned(ldt_delta as u64, &mut body);
            }
        }

        // Write deletion (if HAS_DELETION)
        if (flags & ROW_HAS_DELETION) != 0 {
            // Row tombstone: Cassandra canonical order (markedForDeleteAt first, then localDeletionTime)
            // Per SerializationHeader.writeDeletionTime(): writeTimestamp() then writeLocalDeletionTime()
            // Fix #644 (S6): both are UNSIGNED VInt.
            let ts_delta = (mutation.timestamp_micros - self.stats.min_timestamp) as u64;
            encode_unsigned(ts_delta, &mut body);

            let local_deletion_time = (mutation.timestamp_micros / 1_000_000) as i32;
            let ldt_delta =
                local_deletion_time.wrapping_sub(self.stats.min_local_deletion_time) as u32;
            encode_unsigned(ldt_delta as u64, &mut body);

            // Issue #717: the columns subset is NOT optional for tombstone rows.
            // Cassandra's UnfilteredSerializer always reads it after the deletion
            // times whenever HAS_ALL_COLUMNS is unset; omitting it makes the
            // reader consume the next row's bytes as a subset bitmask
            // ("Invalid Columns subset bytes; too many bits set").
            if (flags & ROW_HAS_ALL_COLUMNS) == 0 {
                let static_columns = self.static_columns(schema);
                let empty_present: std::collections::HashSet<&str> =
                    std::collections::HashSet::new();
                self.write_column_subset(&mut body, &static_columns, &empty_present)?;
            }

            // No cells written for row tombstones
            return Ok(body);
        }

        // Write column bitmap (if NOT HAS_ALL_COLUMNS)
        // For static rows, bitmap only covers static columns
        if (flags & ROW_HAS_ALL_COLUMNS) == 0 {
            self.write_static_column_bitmap(&mut body, mutation, schema)?;
        }

        // Write cell data for static columns only
        self.write_static_cells(&mut body, mutation, schema)?;

        Ok(body)
    }

    /// Write the columns subset of a static row.
    ///
    /// Uses the Cassandra `Columns.Serializer.serializeSubset()` format, but
    /// measured against the static columns of the schema only. A column counts
    /// as present when the mutation writes a non-NULL value to it or deletes
    /// it; NULL writes and row deletions do not mark any column present.
    fn write_static_column_bitmap(
        &self,
        buf: &mut Vec<u8>,
        mutation: &Mutation,
        schema: &TableSchema,
    ) -> Result<()> {
        use crate::storage::write_engine::mutation::CellOperation;

        let mut present_columns: std::collections::HashSet<&str> =
            std::collections::HashSet::new();
        for op in &mutation.operations {
            match op {
                CellOperation::Write { column, value }
                | CellOperation::WriteWithTtl { column, value, .. } => {
                    // NULL writes are represented by absence in the subset.
                    if !matches!(value, Value::Null) {
                        present_columns.insert(column.as_str());
                    }
                }
                CellOperation::Delete { column } => {
                    present_columns.insert(column.as_str());
                }
                CellOperation::DeleteRow => {}
            }
        }

        let static_columns = self.static_columns(schema);
        self.write_column_subset(buf, &static_columns, &present_columns)
    }

    /// Write the cells of a static row.
    ///
    /// Operations are visited in the order produced by `sorted_operations`
    /// for the static column list. Operations on non-static columns and NULL
    /// writes are skipped; column deletes become tombstone cells whose local
    /// deletion time is the mutation timestamp in whole seconds.
    fn write_static_cells(
        &self,
        buf: &mut Vec<u8>,
        mutation: &Mutation,
        schema: &TableSchema,
    ) -> Result<()> {
        use crate::storage::write_engine::mutation::CellOperation;

        // Names of the static columns, used to filter out everything else.
        let static_column_names: std::collections::HashSet<_> = schema
            .columns
            .iter()
            .filter(|c| c.is_static)
            .map(|c| &c.name)
            .collect();

        let timestamp = mutation.timestamp_micros;

        for op in self.sorted_operations(mutation, &self.static_columns(schema)) {
            match op {
                CellOperation::Write { column, value } => {
                    if !static_column_names.contains(column) || matches!(value, Value::Null) {
                        continue;
                    }
                    self.write_cell(buf, column, value, timestamp)?;
                }
                CellOperation::WriteWithTtl {
                    column,
                    value,
                    ttl_seconds,
                } => {
                    if !static_column_names.contains(column) || matches!(value, Value::Null) {
                        continue;
                    }
                    self.write_cell_with_ttl(buf, column, value, timestamp, *ttl_seconds)?;
                }
                CellOperation::Delete { column } => {
                    if !static_column_names.contains(column) {
                        continue;
                    }
                    let local_deletion_time = (timestamp / 1_000_000) as i32;
                    self.write_tombstone_cell(buf, column, timestamp, local_deletion_time)?;
                }
                CellOperation::DeleteRow => {
                    // Row deletion handled at row level with HAS_DELETION flag
                }
            }
        }

        Ok(())
    }

    /// Build a row body from a single mutation (legacy/test entry point).
    ///
    /// The mutation is merged as a one-element group and handed to the
    /// merged-row body builder. When the merge yields no row, a row with the
    /// mutation's own timestamp and TTL, no deletion and no cells is used
    /// instead.
    ///
    /// Returns the bytes produced by the merged-row body builder for the
    /// given `flags`.
    #[cfg(test)]
    fn build_row_body(
        &self,
        mutation: &Mutation,
        schema: &TableSchema,
        flags: u8,
    ) -> Result<Vec<u8>> {
        let row = match Self::merge_row_group(&[mutation], schema, false, None) {
            Some(merged) => merged,
            None => RowWrite {
                clustering_key: mutation.clustering_key.as_ref(),
                liveness_ts: Some(mutation.timestamp_micros),
                ttl_seconds: mutation.ttl_seconds,
                row_deletion: None,
                ops: Vec::new(),
            },
        };
        self.build_merged_row_body(&row, schema, flags)
    }

    /// Build a merged row body (everything after the row_size VInt, excluding
    /// the prev_unfiltered_size VInt written by the caller).
    ///
    /// Field order per Cassandra's `UnfilteredSerializer.serializeRowBody`:
    /// liveness timestamp, TTL + expiration LDT, row deletion, columns
    /// subset, then cells. Issue #717: the columns subset is written for
    /// EVERY row lacking HAS_ALL_COLUMNS — including row tombstones.
    fn build_merged_row_body(
        &self,
        row: &RowWrite<'_>,
        schema: &TableSchema,
        flags: u8,
    ) -> Result<Vec<u8>> {
        let mut body = Vec::new();

        // Write timestamp delta (if HAS_TIMESTAMP)
        //
        // Fix #644 (S6): Cassandra writes UNSIGNED VInt for all temporal deltas.
        // SerializationHeader.java:167: out.writeUnsignedVInt(timestamp - stats.minTimestamp)
        if (flags & ROW_HAS_TIMESTAMP) != 0 {
            let liveness_ts = row.liveness_ts.ok_or_else(|| {
                Error::InvalidInput(
                    "ROW_HAS_TIMESTAMP set but row has no liveness timestamp".to_string(),
                )
            })?;
            let timestamp_delta = (liveness_ts - self.stats.min_timestamp) as u64;
            encode_unsigned(timestamp_delta, &mut body);
        }

        // Write TTL delta (if HAS_TTL)
        //
        // Fix #644 (S6): Both TTL and LDT deltas are UNSIGNED VInt.
        // SerializationHeader.java:177: out.writeUnsignedVInt32(ttl - stats.minTTL)
        // SerializationHeader.java:172: out.writeUnsignedVInt32(ldt - stats.minLocalDeletionTime)
        if (flags & ROW_HAS_TTL) != 0 {
            if let Some(ttl) = row.ttl_seconds {
                let ttl_delta = ttl as i64 - self.stats.min_ttl as i64;
                if ttl_delta < 0 {
                    return Err(Error::InvalidInput(format!(
                        "TTL {} is less than min_ttl {}",
                        ttl, self.stats.min_ttl
                    )));
                }
                encode_unsigned(ttl_delta as u64, &mut body);

                let local_deletion_time = self.expiring_local_deletion_time(ttl)?;
                let ldt_delta =
                    (local_deletion_time as i64) - (self.stats.min_local_deletion_time as i64);
                if ldt_delta < 0 {
                    return Err(Error::InvalidInput(format!(
                        "Local deletion time {} is less than min_local_deletion_time {}",
                        local_deletion_time, self.stats.min_local_deletion_time
                    )));
                }
                encode_unsigned(ldt_delta as u64, &mut body);
            }
        }

        // Write deletion (if HAS_DELETION)
        if (flags & ROW_HAS_DELETION) != 0 {
            // Row tombstone: Cassandra canonical order (markedForDeleteAt first, then localDeletionTime)
            // Per SerializationHeader.writeDeletionTime(): writeTimestamp() then writeLocalDeletionTime()
            // Fix #644 (S6): both are UNSIGNED VInt.
            let (deletion_ts, local_deletion_time) = row.row_deletion.ok_or_else(|| {
                Error::InvalidInput("ROW_HAS_DELETION set but row has no deletion time".to_string())
            })?;
            let ts_delta = (deletion_ts - self.stats.min_timestamp) as u64;
            encode_unsigned(ts_delta, &mut body);

            let ldt_delta =
                local_deletion_time.wrapping_sub(self.stats.min_local_deletion_time) as u32;
            encode_unsigned(ldt_delta as u64, &mut body);
        }

        // Write column bitmap (if NOT HAS_ALL_COLUMNS).
        // Issue #717: this is written even for row tombstones — Cassandra's
        // deserializer reads the subset right after the deletion times.
        if (flags & ROW_HAS_ALL_COLUMNS) == 0 {
            self.write_merged_column_bitmap(&mut body, &row.ops, schema)?;
        }

        // Write cell data (none survive for pure row tombstones)
        self.write_merged_cells(&mut body, row, schema)?;

        Ok(body)
    }

    /// Write clustering prefix
    ///
    /// Format:
    /// ```text
    /// [header: VInt]              ← 2 bits per clustering column (state)
    /// [value_1: type-specific]    ← Only if state is PRESENT (00)
    /// [value_2: type-specific]
    /// ...
    /// ```
    fn write_clustering_prefix(
        &mut self,
        clustering_key: &crate::storage::write_engine::mutation::ClusteringKey,
        schema: &TableSchema,
    ) -> Result<()> {
        // Build header: 2 bits per column
        // 00 = PRESENT, 01 = EMPTY, 10 = NULL, 11 = reserved
        let mut header = 0u64;
        for (i, (_, value)) in clustering_key.columns.iter().enumerate() {
            let state = match value {
                Value::Null => 2, // NULL
                _ => 0,           // PRESENT
            };
            header |= (state as u64) << (i * 2);
        }

        // Write header as VUInt
        encode_unsigned(header, &mut self.buffer);

        // Write values for PRESENT columns
        for (i, (_, value)) in clustering_key.columns.iter().enumerate() {
            if !matches!(value, Value::Null) {
                // Get clustering column definition
                if i >= schema.clustering_keys.len() {
                    return Err(Error::Schema(format!(
                        "Clustering key has more columns than schema: {} > {}",
                        i + 1,
                        schema.clustering_keys.len()
                    )));
                }
                let cluster_col = &schema.clustering_keys[i];
                let comparator = ComparatorType::from_data_type(&cluster_col.data_type)?;

                // Write value bytes (type-specific encoding)
                let value_bytes = serialize_value_for_clustering(value, &comparator)?;
                self.buffer.extend_from_slice(&value_bytes);
            }
        }

        Ok(())
    }

    /// Write the columns subset for a single mutation.
    ///
    /// Cassandra `Columns.Serializer.serializeSubset()` format.
    ///
    /// For <64 regular columns (the common case), this writes a single
    /// unsigned VInt whose bits indicate **missing** columns:
    ///   - bit = 1 → column is MISSING (NULL / not written)
    ///   - bit = 0 → column is PRESENT
    ///   - bitmap = 0 means all columns present (this case is prevented by
    ///     the caller which sets `HAS_ALL_COLUMNS` instead).
    ///
    /// Only regular columns participate in the bitmap — partition key and
    /// clustering key columns are serialized elsewhere.
    #[cfg(test)]
    fn write_column_bitmap(
        &self,
        buf: &mut Vec<u8>,
        mutation: &Mutation,
        schema: &TableSchema,
    ) -> Result<()> {
        use crate::storage::write_engine::mutation::CellOperation;

        // A column is present when it receives a non-NULL write or a delete.
        // Deletes must count as present so the reader parses the
        // tombstone/complex-deletion bytes emitted for them.
        let mut present_columns: std::collections::HashSet<&str> =
            std::collections::HashSet::new();
        for op in &mutation.operations {
            match op {
                CellOperation::Write { column, value }
                | CellOperation::WriteWithTtl { column, value, .. } => {
                    if !matches!(value, Value::Null) {
                        present_columns.insert(column.as_str());
                    }
                }
                CellOperation::Delete { column } => {
                    present_columns.insert(column.as_str());
                }
                CellOperation::DeleteRow => {}
            }
        }

        let regular_columns = self.regular_columns(schema);
        self.write_column_subset(buf, &regular_columns, &present_columns)
    }

    /// Write the columns subset for a merged row's surviving operations.
    ///
    /// The subset is measured against the regular columns of the schema. A
    /// column is present when a surviving op writes a non-NULL value to it or
    /// deletes it; for a pure row tombstone the ops list is empty, producing
    /// the all-missing bitmask.
    fn write_merged_column_bitmap(
        &self,
        buf: &mut Vec<u8>,
        ops: &[MergedOp<'_>],
        schema: &TableSchema,
    ) -> Result<()> {
        use crate::storage::write_engine::mutation::CellOperation;

        let mut present_columns: std::collections::HashSet<&str> =
            std::collections::HashSet::new();
        for mop in ops {
            match mop.op {
                CellOperation::Write { column, value }
                | CellOperation::WriteWithTtl { column, value, .. } => {
                    // NULL writes are represented by absence in the subset.
                    if !matches!(value, Value::Null) {
                        present_columns.insert(column.as_str());
                    }
                }
                CellOperation::Delete { column } => {
                    present_columns.insert(column.as_str());
                }
                CellOperation::DeleteRow => {}
            }
        }

        let regular_columns = self.regular_columns(schema);
        self.write_column_subset(buf, &regular_columns, &present_columns)
    }

    /// Collect the regular columns of `schema`: every column that is neither
    /// static, nor part of the partition key, nor part of the clustering key.
    ///
    /// Cassandra's column bitmap only covers regular columns — partition key
    /// and clustering key columns are serialized separately in the partition
    /// header and clustering prefix. Within the regular set, simple columns
    /// sort before complex columns, then by name.
    fn regular_columns<'a>(&self, schema: &'a TableSchema) -> Vec<&'a Column> {
        self.ordered_columns(schema, |column| {
            !(column.is_static
                || schema.is_partition_key(&column.name)
                || schema.is_clustering_key(&column.name))
        })
    }

    /// Get static columns from schema in Cassandra serialization-header order.
    ///
    /// Delegates to `ordered_columns`, keeping only columns whose `is_static`
    /// flag is set; the ordering itself is whatever `ordered_columns` yields.
    fn static_columns<'a>(&self, schema: &'a TableSchema) -> Vec<&'a Column> {
        self.ordered_columns(schema, |column| column.is_static)
    }

    /// Write the surviving cells of a merged row, in regular-column order.
    ///
    /// Cells are written in the same order as Cassandra's `Columns` sorting
    /// (regular columns are sorted by name).
    ///
    /// Cells whose timestamp matches the row liveness timestamp use
    /// USE_ROW_TIMESTAMP; cells merged in from other mutations (e.g. a later
    /// single-cell DELETE) carry an explicit timestamp delta.
    fn write_merged_cells(
        &self,
        buf: &mut Vec<u8>,
        row: &RowWrite<'_>,
        schema: &TableSchema,
    ) -> Result<()> {
        use crate::storage::write_engine::mutation::CellOperation;

        for mop in self.sorted_merged_ops(&row.ops, schema) {
            match mop.op {
                CellOperation::Write { column, value } => {
                    // Skip NULL values - they are represented by absence in the bitmap
                    if matches!(value, Value::Null) {
                        continue;
                    }
                    // Check if this column is a complex column (non-frozen collection)
                    let is_complex = schema
                        .columns
                        .iter()
                        .find(|c| c.name == *column)
                        .map(|c| is_complex_column(&c.data_type))
                        .unwrap_or(false);

                    if is_complex {
                        let col = schema
                            .columns
                            .iter()
                            .find(|c| c.name == *column)
                            .ok_or_else(|| {
                                Error::Schema(format!(
                                    "Complex column '{}' not found in schema",
                                    column
                                ))
                            })?;
                        self.write_complex_column(buf, col, value, mop.timestamp_micros, None)?;
                    } else if let Some(ttl_seconds) = mop.row_ttl_seconds {
                        if row.ttl_seconds == Some(ttl_seconds)
                            && row.liveness_ts == Some(mop.timestamp_micros)
                        {
                            self.write_cell_with_row_ttl(
                                buf,
                                column,
                                value,
                                mop.timestamp_micros,
                                ttl_seconds,
                            )?;
                        } else {
                            self.write_cell_with_ttl(
                                buf,
                                column,
                                value,
                                mop.timestamp_micros,
                                ttl_seconds,
                            )?;
                        }
                    } else if row.liveness_ts == Some(mop.timestamp_micros) {
                        self.write_cell(buf, column, value, mop.timestamp_micros)?;
                    } else {
                        self.write_cell_explicit_ts(buf, column, value, mop.timestamp_micros)?;
                    }
                }
                CellOperation::WriteWithTtl {
                    column,
                    value,
                    ttl_seconds,
                } => {
                    // Skip NULL values - they are represented by absence in the bitmap
                    if matches!(value, Value::Null) {
                        continue;
                    }
                    let is_complex = schema
                        .columns
                        .iter()
                        .find(|c| c.name == *column)
                        .map(|c| is_complex_column(&c.data_type))
                        .unwrap_or(false);

                    if is_complex {
                        let col = schema
                            .columns
                            .iter()
                            .find(|c| c.name == *column)
                            .ok_or_else(|| {
                                Error::Schema(format!(
                                    "Complex column '{}' not found in schema",
                                    column
                                ))
                            })?;
                        self.write_complex_column(
                            buf,
                            col,
                            value,
                            mop.timestamp_micros,
                            Some(*ttl_seconds),
                        )?;
                    } else {
                        self.write_cell_with_ttl(
                            buf,
                            column,
                            value,
                            mop.timestamp_micros,
                            *ttl_seconds,
                        )?;
                    }
                }
                CellOperation::Delete { column } => {
                    let is_complex = schema
                        .columns
                        .iter()
                        .find(|c| c.name == *column)
                        .map(|c| is_complex_column(&c.data_type))
                        .unwrap_or(false);

                    if is_complex {
                        // Complex column deletion: write empty complex column
                        // with active deletion time (not LIVE)
                        self.write_complex_column_deletion(buf, mop.timestamp_micros)?;
                    } else {
                        let local_deletion_time = (mop.timestamp_micros / 1_000_000) as i32;
                        self.write_tombstone_cell(
                            buf,
                            column,
                            mop.timestamp_micros,
                            local_deletion_time,
                        )?;
                    }
                }
                CellOperation::DeleteRow => {
                    // Row deletion handled at row level with HAS_DELETION flag
                }
            }
        }

        Ok(())
    }

    /// Write a complex column (non-frozen collection stored as multiple cells).
    ///
    /// Complex columns use the following wire format:
    /// ```text
    /// [complex_deletion: marked_for_delete_at (signed VInt) + local_deletion_time (unsigned VInt)]
    /// [cell_count: unsigned VInt]
    /// For each cell:
    ///   [flags: u8]
    ///   [cell_path_length: unsigned VInt]
    ///   [cell_path_bytes]
    ///   [value_length: unsigned VInt]  (if not HAS_EMPTY_VALUE)
    ///   [value_bytes]
    /// ```
    ///
    /// Per collection type:
    /// - SET<T>: cell_path = serialized element, value = empty (HAS_EMPTY_VALUE)
    /// - MAP<K,V>: cell_path = serialized key, value = serialized value
    /// - LIST<T>: cell_path = 16-byte TimeUUID, value = serialized element
    fn write_complex_column(
        &self,
        buf: &mut Vec<u8>,
        column: &Column,
        value: &Value,
        timestamp_micros: i64,
        ttl_seconds: Option<u32>,
    ) -> Result<()> {
        // Write complex deletion time: DeletionTime.LIVE
        // Cassandra canonical order: markedForDeleteAt first, then localDeletionTime
        // Per SerializationHeader.writeDeletionTime(): writeTimestamp() then writeLocalDeletionTime()
        // Fix #644 (S6): markedForDeleteAt delta is UNSIGNED VInt.
        // DeletionTime.LIVE.markedForDeleteAt = Long.MIN_VALUE; delta wraps to large positive u64.
        let ts_delta = i64::MIN.wrapping_sub(self.stats.min_timestamp) as u64;
        encode_unsigned(ts_delta, buf);
        // localDeletionTime delta = Integer.MAX_VALUE - stats.min_local_deletion_time (unsigned VInt)
        let ldt_delta = i32::MAX.wrapping_sub(self.stats.min_local_deletion_time) as u32;
        encode_unsigned(ldt_delta as u64, buf);

        let dt = column.data_type.to_lowercase();

        if dt.starts_with("set<") || dt.starts_with("org.apache.cassandra.db.marshal.settype(") {
            self.write_set_complex_cells(buf, value, timestamp_micros, ttl_seconds)?;
        } else if dt.starts_with("map<")
            || dt.starts_with("org.apache.cassandra.db.marshal.maptype(")
        {
            self.write_map_complex_cells(buf, value, timestamp_micros, ttl_seconds)?;
        } else if dt.starts_with("list<")
            || dt.starts_with("org.apache.cassandra.db.marshal.listtype(")
        {
            self.write_list_complex_cells(buf, value, timestamp_micros, ttl_seconds)?;
        } else {
            return Err(Error::InvalidInput(format!(
                "Column '{}' has type '{}' which is not a recognized complex column type",
                column.name, column.data_type
            )));
        }

        Ok(())
    }

    /// Write the deletion of an entire complex column (all collection elements).
    ///
    /// The output is an active deletion time followed by an empty cell list.
    /// Field order follows SerializationHeader.writeDeletionTime(): timestamp
    /// first, local deletion time second.
    /// ```text
    /// [marked_for_delete_at: unsigned VInt]  ← mutation timestamp (delta from min)
    /// [local_deletion_time: unsigned VInt]   ← seconds since epoch (delta from min)
    /// [cell_count: unsigned VInt]            ← 0 (no cells)
    /// ```
    fn write_complex_column_deletion(
        &self,
        buf: &mut Vec<u8>,
        timestamp_micros: i64,
    ) -> Result<()> {
        let stats = &self.stats;

        // Fix #644 (S6): marked_for_delete_at delta is an UNSIGNED VInt.
        let marked_for_delete_at_delta = (timestamp_micros - stats.min_timestamp) as u64;

        // The local deletion time is the mutation timestamp expressed in seconds.
        let deletion_seconds = (timestamp_micros / 1_000_000) as i32;
        let local_deletion_delta =
            deletion_seconds.wrapping_sub(stats.min_local_deletion_time) as u32;

        encode_unsigned(marked_for_delete_at_delta, buf);
        encode_unsigned(u64::from(local_deletion_delta), buf);

        // A deleted collection carries no cells.
        encode_unsigned(0u64, buf);

        Ok(())
    }

    /// Write the header of one complex-column cell: the flags byte plus, for
    /// expiring cells, the explicit timestamp / local-deletion-time / TTL deltas.
    ///
    /// When `ttl_seconds` is `Some`, writes:
    /// - flags: `base_flags | CELL_IS_EXPIRING` (0x02), NO USE_ROW_TIMESTAMP
    /// - timestamp delta (unsigned VInt; fix #644: all temporal deltas are unsigned)
    /// - local_deletion_time delta (unsigned VInt), where the deletion time is
    ///   "now + TTL" as computed by `expiring_local_deletion_time`
    /// - TTL delta (unsigned VInt)
    ///
    /// When `ttl_seconds` is `None`, writes only:
    /// - flags: `base_flags | CELL_USE_ROW_TIMESTAMP` (0x08)
    ///
    /// The cell path and value are written by the caller after this header.
    ///
    /// # Errors
    /// Returns `Error::Storage` if the system clock cannot be read, and
    /// `Error::InvalidInput` if the local deletion time or the TTL is below the
    /// corresponding statistics baseline. All validation happens before the
    /// first byte is appended, so `buf` is left untouched on error.
    fn write_complex_cell_header(
        &self,
        buf: &mut Vec<u8>,
        base_flags: u8,
        timestamp_micros: i64,
        ttl_seconds: Option<u32>,
    ) -> Result<()> {
        match ttl_seconds {
            Some(ttl) => {
                // local_deletion_time = now + ttl (shared with simple expiring cells).
                let local_deletion_time = self.expiring_local_deletion_time(ttl)?;
                let ldt_delta =
                    (local_deletion_time as i64) - (self.stats.min_local_deletion_time as i64);
                if ldt_delta < 0 {
                    return Err(Error::InvalidInput(format!(
                        "Complex cell: local deletion time {} is less than min_local_deletion_time {}",
                        local_deletion_time, self.stats.min_local_deletion_time
                    )));
                }

                let ttl_delta = (ttl as i64) - (self.stats.min_ttl as i64);
                if ttl_delta < 0 {
                    return Err(Error::InvalidInput(format!(
                        "Complex cell: TTL {} is less than min_ttl {}",
                        ttl, self.stats.min_ttl
                    )));
                }

                // Fix #644 (S6): SerializationHeader.java:167 uses writeUnsignedVInt.
                let timestamp_delta = (timestamp_micros - self.stats.min_timestamp) as u64;

                // Expiring cell: IS_EXPIRING flag, explicit timestamp + LDT + TTL.
                buf.push(base_flags | CELL_IS_EXPIRING);
                encode_unsigned(timestamp_delta, buf);
                encode_unsigned(ldt_delta as u64, buf);
                encode_unsigned(ttl_delta as u64, buf);
            }
            None => {
                // Non-expiring cell: inherit the row timestamp.
                buf.push(base_flags | CELL_USE_ROW_TIMESTAMP);
            }
        }
        Ok(())
    }

    /// Write the cell list of a non-frozen SET column.
    ///
    /// Each element becomes one cell whose cell_path is the serialized element
    /// and whose value is empty (HAS_EMPTY_VALUE). Cells are emitted in ascending
    /// order of their serialized bytes.
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` when `value` is not a `Value::Set`, and
    /// propagates element serialization and cell header errors.
    fn write_set_complex_cells(
        &self,
        buf: &mut Vec<u8>,
        value: &Value,
        timestamp_micros: i64,
        ttl_seconds: Option<u32>,
    ) -> Result<()> {
        let members = if let Value::Set(members) = value {
            members
        } else {
            return Err(Error::InvalidInput(format!(
                "Expected Set value for complex SET column, got {:?}",
                value
            )));
        };

        // Serialize every element up front so the cell paths can be ordered.
        // Element validation is delegated to serialize_collection_element.
        let mut cell_paths: Vec<Vec<u8>> = Vec::new();
        for member in members.iter() {
            cell_paths.push(serialize_collection_element(member, "SET")?);
        }
        // NOTE(review): ordering is plain byte-lexicographic; confirm this matches
        // the element type's comparator for every supported element type.
        cell_paths.sort();

        // Cell count
        encode_unsigned(cell_paths.len() as u64, buf);

        for cell_path in cell_paths.iter() {
            // Flags (+ TTL fields for expiring cells); the value is always empty.
            self.write_complex_cell_header(
                buf,
                CELL_HAS_EMPTY_VALUE,
                timestamp_micros,
                ttl_seconds,
            )?;

            // Cell path: length-prefixed serialized element. No value bytes follow.
            encode_unsigned(cell_path.len() as u64, buf);
            buf.extend_from_slice(cell_path);
        }

        Ok(())
    }

    /// Write the cell list of a non-frozen MAP column.
    ///
    /// Each entry becomes one cell: cell_path = serialized key, cell value =
    /// serialized value. Cells are emitted in ascending order of the serialized
    /// key bytes (stable for equal keys).
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` when `value` is not a `Value::Map` or when
    /// a key is `Value::Null`, and propagates serialization and cell header
    /// errors.
    fn write_map_complex_cells(
        &self,
        buf: &mut Vec<u8>,
        value: &Value,
        timestamp_micros: i64,
        ttl_seconds: Option<u32>,
    ) -> Result<()> {
        let pairs = if let Value::Map(pairs) = value {
            pairs
        } else {
            return Err(Error::InvalidInput(format!(
                "Expected Map value for complex MAP column, got {:?}",
                value
            )));
        };

        // Serialize (key, value) pairs first so they can be ordered by key bytes.
        // Null keys are rejected here; values are handed to serialize_value as-is.
        let mut cells: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
        for (key, val) in pairs.iter() {
            if matches!(key, Value::Null) {
                return Err(Error::InvalidInput(
                    "MAP keys cannot be null (CQL semantics)".to_string(),
                ));
            }
            let key_bytes = serialize_value(key)?;
            let val_bytes = serialize_value(val)?;
            cells.push((key_bytes, val_bytes));
        }
        cells.sort_by(|(left_key, _), (right_key, _)| left_key.cmp(right_key));

        // Cell count
        encode_unsigned(cells.len() as u64, buf);

        for (key_bytes, val_bytes) in cells.iter() {
            // Flags (+ TTL fields for expiring cells)
            self.write_complex_cell_header(buf, 0, timestamp_micros, ttl_seconds)?;

            // Cell path: length-prefixed serialized key
            encode_unsigned(key_bytes.len() as u64, buf);
            buf.extend_from_slice(key_bytes);

            // Cell value: length-prefixed serialized value
            encode_unsigned(val_bytes.len() as u64, buf);
            buf.extend_from_slice(val_bytes);
        }

        Ok(())
    }

    /// Write the cell list of a non-frozen LIST column.
    ///
    /// Each element becomes one cell: cell_path = 16-byte TimeUUID generated from
    /// the mutation timestamp and the element index, cell value = serialized
    /// element. Elements are written in the order they appear in the list.
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` when `value` is not a `Value::List` or when
    /// an element is `Value::Null`, and propagates serialization and cell header
    /// errors.
    fn write_list_complex_cells(
        &self,
        buf: &mut Vec<u8>,
        value: &Value,
        timestamp_micros: i64,
        ttl_seconds: Option<u32>,
    ) -> Result<()> {
        let items = match value {
            Value::List(items) => items,
            other => {
                return Err(Error::InvalidInput(format!(
                    "Expected List value for complex LIST column, got {:?}",
                    other
                )))
            }
        };

        // Cell count
        encode_unsigned(items.len() as u64, buf);

        for (index, item) in items.iter().enumerate() {
            // CQL does not allow null list elements.
            if let Value::Null = item {
                return Err(Error::InvalidInput(
                    "LIST elements cannot be null (CQL semantics)".to_string(),
                ));
            }

            // Flags (+ TTL fields for expiring cells)
            self.write_complex_cell_header(buf, 0, timestamp_micros, ttl_seconds)?;

            // Cell path: 16-byte TimeUUID, one per element index.
            let cell_path = generate_list_cell_path_timeuuid(timestamp_micros, index as u64);
            encode_unsigned(16u64, buf);
            buf.extend_from_slice(&cell_path);

            // Cell value: length-prefixed serialized element
            let element_bytes = serialize_value(item)?;
            encode_unsigned(element_bytes.len() as u64, buf);
            buf.extend_from_slice(&element_bytes);
        }

        Ok(())
    }

    /// Write a single live cell that inherits the row timestamp.
    ///
    /// Format:
    /// ```text
    /// [flags: u8]                ← USE_ROW_TIMESTAMP (plus HAS_EMPTY_VALUE for empty text)
    /// [value_length: VUInt]      ← variable-length types only
    /// [value_bytes]
    /// ```
    ///
    /// The cell always sets USE_ROW_TIMESTAMP, so no timestamp delta is ever
    /// written and the `_timestamp` argument is not consulted; cells that need
    /// their own timestamp go through `write_cell_explicit_ts`.
    ///
    /// NOTE: NULL values should NOT be written - they are represented by absence in the bitmap.
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` if called with `Value::Null` or if the
    /// serialized value is too large, and propagates value serialization errors.
    fn write_cell(
        &self,
        buf: &mut Vec<u8>,
        column: &str,
        value: &Value,
        _timestamp: i64,
    ) -> Result<()> {
        // NULL values should not be written as cells - they are represented by absence
        if matches!(value, Value::Null) {
            return Err(Error::InvalidInput(format!(
                "NULL values should not be written as cells (column: {}). They are represented by absence in the bitmap.",
                column
            )));
        }

        // Cell flags: the row timestamp is always used for this cell kind.
        let mut flags = CELL_USE_ROW_TIMESTAMP;

        // Empty string: set HAS_EMPTY_VALUE flag
        // This is for actual empty strings (''), not NULLs
        let is_empty_string = matches!(value, Value::Text(s) if s.is_empty());
        if is_empty_string {
            flags |= CELL_HAS_EMPTY_VALUE;
        }

        buf.push(flags);

        // With HAS_EMPTY_VALUE set, neither a length nor value bytes follow.
        if is_empty_string {
            return Ok(());
        }

        // Value
        let value_bytes = serialize_value(value)?;

        // Bounds check: value length must fit in i64
        if value_bytes.len() > i64::MAX as usize {
            return Err(Error::InvalidInput(format!(
                "Value too large for column '{}': {} bytes (max {})",
                column,
                value_bytes.len(),
                i64::MAX
            )));
        }

        // Only variable-length types carry a length prefix.
        if cell_value_uses_length_prefix(value) {
            encode_unsigned(value_bytes.len() as u64, buf);
        }

        // Write value bytes
        buf.extend_from_slice(&value_bytes);

        Ok(())
    }

    /// Write a live cell with its own timestamp (USE_ROW_TIMESTAMP is not set).
    ///
    /// This is the form used for cells whose timestamp differs from the row's
    /// liveness timestamp, e.g. cells merged in from another mutation.
    ///
    /// Format:
    /// ```text
    /// [flags: u8]                ← 0x00 (or HAS_EMPTY_VALUE for empty text)
    /// [timestamp_delta: VUInt]   ← delta from min_timestamp
    /// [value_length: VUInt]      ← variable-length types only
    /// [value_bytes]
    /// ```
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` for `Value::Null` or an oversized value, and
    /// propagates value serialization errors.
    fn write_cell_explicit_ts(
        &self,
        buf: &mut Vec<u8>,
        column: &str,
        value: &Value,
        timestamp: i64,
    ) -> Result<()> {
        // NULLs are expressed through the column bitmap, never as a cell.
        if let Value::Null = value {
            return Err(Error::InvalidInput(format!(
                "NULL values should not be written as cells (column: {}). They are represented by absence in the bitmap.",
                column
            )));
        }

        let has_empty_value = matches!(value, Value::Text(text) if text.is_empty());
        let flags = if has_empty_value {
            CELL_HAS_EMPTY_VALUE
        } else {
            0u8
        };
        buf.push(flags);

        // Explicit timestamp, encoded as an UNSIGNED VInt delta.
        encode_unsigned((timestamp - self.stats.min_timestamp) as u64, buf);

        // An empty text value has neither length nor payload.
        if has_empty_value {
            return Ok(());
        }

        let payload = serialize_value(value)?;
        let payload_len = payload.len();
        if payload_len > i64::MAX as usize {
            return Err(Error::InvalidInput(format!(
                "Value too large for column '{}': {} bytes (max {})",
                column,
                payload_len,
                i64::MAX
            )));
        }

        // Only variable-length types carry a length prefix.
        if cell_value_uses_length_prefix(value) {
            encode_unsigned(payload_len as u64, buf);
        }
        buf.extend_from_slice(&payload);

        Ok(())
    }

    /// Write an expiring cell that carries its own timestamp and TTL.
    ///
    /// Format:
    /// ```text
    /// [flags: u8]                         ← CELL_IS_EXPIRING (0x02) set
    /// [timestamp_delta: VUInt]            ← Delta from min_timestamp (NOT USE_ROW_TIMESTAMP for TTL cells)
    /// [local_deletion_time_delta: VUInt]  ← When the cell expires (relative to min_local_deletion_time)
    /// [ttl_delta: VUInt]                  ← TTL value (relative to min_ttl)
    /// [value_length: VUInt]               ← variable-length types only
    /// [value_bytes]
    /// ```
    ///
    /// CRITICAL: cells written here never set USE_ROW_TIMESTAMP or USE_ROW_TTL;
    /// they always carry explicit timestamp and TTL deltas.
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` for `Value::Null`, for a local deletion time
    /// or TTL below the statistics baseline, or for an oversized value; also
    /// propagates clock and value serialization errors.
    fn write_cell_with_ttl(
        &self,
        buf: &mut Vec<u8>,
        column: &str,
        value: &Value,
        timestamp: i64,
        ttl_seconds: u32,
    ) -> Result<()> {
        // NULLs are expressed through the column bitmap, never as a cell.
        if let Value::Null = value {
            return Err(Error::InvalidInput(format!(
                "NULL values should not be written as cells (column: {}). They are represented by absence in the bitmap.",
                column
            )));
        }

        // Resolve the expiry time before emitting anything.
        let expires_at = self.expiring_local_deletion_time(ttl_seconds)?;

        // Flags: IS_EXPIRING, optionally HAS_EMPTY_VALUE for empty text.
        let has_empty_value = matches!(value, Value::Text(text) if text.is_empty());
        let flags = if has_empty_value {
            CELL_IS_EXPIRING | CELL_HAS_EMPTY_VALUE
        } else {
            CELL_IS_EXPIRING
        };
        buf.push(flags);

        // Fix #644 (S6): cell timestamp delta is an UNSIGNED VInt.
        // SerializationHeader.java:167: out.writeUnsignedVInt(timestamp - stats.minTimestamp)
        encode_unsigned((timestamp - self.stats.min_timestamp) as u64, buf);

        // Local deletion time, relative to the statistics baseline.
        let expiry_delta = (expires_at as i64) - (self.stats.min_local_deletion_time as i64);
        if expiry_delta < 0 {
            return Err(Error::InvalidInput(format!(
                "Local deletion time {} is less than min_local_deletion_time {}",
                expires_at, self.stats.min_local_deletion_time
            )));
        }
        encode_unsigned(expiry_delta as u64, buf);

        // TTL, relative to the statistics baseline.
        let ttl_offset = (ttl_seconds as i64) - (self.stats.min_ttl as i64);
        if ttl_offset < 0 {
            return Err(Error::InvalidInput(format!(
                "TTL {} is less than min_ttl {}",
                ttl_seconds, self.stats.min_ttl
            )));
        }
        encode_unsigned(ttl_offset as u64, buf);

        // An empty text value has neither length nor payload.
        if has_empty_value {
            return Ok(());
        }

        let payload = serialize_value(value)?;
        let payload_len = payload.len();

        // Bounds check: value length must fit in i64
        if payload_len > i64::MAX as usize {
            return Err(Error::InvalidInput(format!(
                "Value too large for column '{}': {} bytes (max {})",
                column,
                payload_len,
                i64::MAX
            )));
        }

        // Only variable-length types carry a length prefix.
        if cell_value_uses_length_prefix(value) {
            encode_unsigned(payload_len as u64, buf);
        }
        buf.extend_from_slice(&payload);

        Ok(())
    }

    /// Write an expiring cell that inherits both timestamp and TTL from its row.
    ///
    /// Format:
    /// ```text
    /// [flags: u8]            ← IS_EXPIRING | USE_ROW_TIMESTAMP | USE_ROW_TTL
    ///                          (plus HAS_EMPTY_VALUE for empty text)
    /// [value_length: VUInt]  ← variable-length types only
    /// [value_bytes]
    /// ```
    ///
    /// No timestamp, local deletion time or TTL delta is written, so the
    /// `_timestamp` and `_ttl_seconds` arguments are not consulted.
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` for `Value::Null` or an oversized value, and
    /// propagates value serialization errors.
    fn write_cell_with_row_ttl(
        &self,
        buf: &mut Vec<u8>,
        column: &str,
        value: &Value,
        _timestamp: i64,
        _ttl_seconds: u32,
    ) -> Result<()> {
        // NULLs are expressed through the column bitmap, never as a cell.
        if let Value::Null = value {
            return Err(Error::InvalidInput(format!(
                "NULL values should not be written as cells (column: {}). They are represented by absence in the bitmap.",
                column
            )));
        }

        let row_scoped = CELL_IS_EXPIRING | CELL_USE_ROW_TIMESTAMP | CELL_USE_ROW_TTL;
        let has_empty_value = matches!(value, Value::Text(text) if text.is_empty());
        let flags = if has_empty_value {
            row_scoped | CELL_HAS_EMPTY_VALUE
        } else {
            row_scoped
        };
        buf.push(flags);

        // An empty text value has neither length nor payload.
        if has_empty_value {
            return Ok(());
        }

        let payload = serialize_value(value)?;
        let payload_len = payload.len();
        if payload_len > i64::MAX as usize {
            return Err(Error::InvalidInput(format!(
                "Value too large for column '{}': {} bytes (max {})",
                column,
                payload_len,
                i64::MAX
            )));
        }

        // Only variable-length types carry a length prefix.
        if cell_value_uses_length_prefix(value) {
            encode_unsigned(payload_len as u64, buf);
        }
        buf.extend_from_slice(&payload);

        Ok(())
    }

    /// Compute the local deletion time (seconds since the Unix epoch) of a cell
    /// written now with the given TTL: `now + ttl_seconds`.
    ///
    /// The sum is computed in `u64` and clamped to `i32::MAX`, so neither a TTL
    /// above `i32::MAX` nor a wall clock past the 32-bit range can wrap into a
    /// negative (already expired) deletion time.
    ///
    /// # Errors
    /// Returns `Error::Storage` if the system clock is before the Unix epoch.
    fn expiring_local_deletion_time(&self, ttl_seconds: u32) -> Result<i32> {
        let now_seconds = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map_err(|e| Error::Storage(format!("System time error: {}", e)))?
            .as_secs();
        let expires_at = now_seconds.saturating_add(u64::from(ttl_seconds));
        // The clamp guarantees the value fits, so the narrowing cast is lossless.
        Ok(expires_at.min(i32::MAX as u64) as i32)
    }

    /// Write a tombstone cell
    ///
    /// Tombstones require:
    /// - IS_DELETED flag set
    /// - Own timestamp (NOT USE_ROW_TIMESTAMP - tombstones need explicit timestamps)
    /// - local_deletion_time field
    /// - No value data
    fn write_tombstone_cell(
        &self,
        buf: &mut Vec<u8>,
        _column: &str,
        timestamp: i64,
        local_deletion_time: i32,
    ) -> Result<()> {
        // Cell flags for tombstone
        // CRITICAL: Do NOT set USE_ROW_TIMESTAMP - tombstones need their own timestamp
        //
        // Issue #716: HAS_EMPTY_VALUE MUST be set. Cassandra's Cell.Serializer
        // derives `hasValue = (flags & HAS_EMPTY_VALUE_MASK) == 0`, so a deleted
        // cell without this flag makes Cassandra read a value that was never
        // written, desyncing the row stream (EOFException on readback).
        let flags = CELL_IS_DELETED | CELL_HAS_EMPTY_VALUE;
        buf.push(flags);

        // Timestamp delta (VInt) - required for tombstones
        // Fix #644 (S6): tombstone timestamp delta is UNSIGNED VInt per Cassandra.
        // SerializationHeader.java:167: out.writeUnsignedVInt(timestamp - stats.minTimestamp)
        let timestamp_delta = (timestamp - self.stats.min_timestamp) as u64;
        encode_unsigned(timestamp_delta, buf);

        // Local deletion time delta (VUInt) - required for tombstones
        let deletion_time_delta =
            (local_deletion_time as i64) - (self.stats.min_local_deletion_time as i64);
        if deletion_time_delta < 0 {
            return Err(Error::InvalidInput(format!(
                "Local deletion time {} is less than min_local_deletion_time {}",
                local_deletion_time, self.stats.min_local_deletion_time
            )));
        }
        encode_unsigned(deletion_time_delta as u64, buf);

        // No value length or value bytes for tombstones
        // Parser returns immediately after reading local_deletion_time
        Ok(())
    }

    /// Append one range tombstone bound marker to the scratch buffer.
    ///
    /// On-disk layout (must mirror the reader's `skip_range_tombstone_marker`
    /// and Cassandra's `UnfilteredSerializer.serialize(RangeTombstoneMarker)`):
    /// ```text
    /// [flags: u8]                      ← IS_MARKER (0x02)
    /// [bound_kind: u8]                 ← ClusteringPrefix.Kind ordinal
    /// [cluster_count: u16 BE]          ← bound.size()
    /// [cluster_header: VUInt]          ← only when cluster_count > 0
    /// [cluster_values: ...]
    /// [marker_body_size: VUInt]        ← size of (prev_size + deletion times)
    /// [prev_unfiltered_size: VUInt]
    /// [marked_for_delete_at: VUInt]    ← delta from min_timestamp (µs)
    /// [local_deletion_time: VUInt]     ← delta from min_local_deletion_time (s)
    /// ```
    ///
    /// Issue #717: the previous writer emitted private bound-kind ordinals,
    /// no u16 cluster count, and no marker_body_size/prev_size VInts — bytes
    /// no Cassandra (or CQLite) reader could parse.
    ///
    /// `is_open` selects a start bound (`true`) or an end bound (`false`).
    ///
    /// Returns the number of bytes appended for this marker (for
    /// prev_unfiltered_size threading).
    ///
    /// # Errors
    /// Returns `Error::InvalidInput` when the bound has more than `u16::MAX`
    /// clustering values, and propagates clustering prefix write errors.
    fn write_range_bound(
        &mut self,
        bound: &ClusteringBound,
        is_open: bool,
        deletion_time: i64,
        local_deletion_time: i32,
        schema: &TableSchema,
        prev_size: u64,
    ) -> Result<usize> {
        let marker_start = self.buffer.len();

        // Marker flag
        self.buffer.push(IS_MARKER);

        // Split the bound into exclusivity + optional clustering values.
        // Bottom/Top are the full-partition bounds: inclusive, zero values.
        let (is_exclusive, clustering) = match bound {
            ClusteringBound::Inclusive(ck) => (false, Some(ck)),
            ClusteringBound::Exclusive(ck) => (true, Some(ck)),
            ClusteringBound::Bottom | ClusteringBound::Top => (false, None),
        };

        // Bound kind (ClusteringPrefix.Kind ordinal).
        let bound_kind = match (is_open, is_exclusive) {
            (true, false) => INCL_START_BOUND,
            (true, true) => EXCL_START_BOUND,
            (false, false) => INCL_END_BOUND,
            (false, true) => EXCL_END_BOUND,
        };
        self.buffer.push(bound_kind);

        // Cluster count (u16 BE) — ClusteringBoundOrBoundary.Serializer
        // writes `out.writeShort(bound.size())` before the values.
        let cluster_count = match clustering {
            Some(ck) => ck.columns.len(),
            None => 0,
        };
        if cluster_count > u16::MAX as usize {
            return Err(Error::InvalidInput(format!(
                "Range tombstone bound has too many clustering values: {}",
                cluster_count
            )));
        }
        let count_bytes = (cluster_count as u16).to_be_bytes();
        self.buffer.write_all(&count_bytes)?;

        // Clustering header + values (only when the bound carries values).
        if let Some(ck) = clustering {
            self.write_clustering_prefix(ck, schema)?;
        }

        // Deletion time in Cassandra's canonical order (markedForDeleteAt first,
        // then localDeletionTime), both UNSIGNED VInt deltas. Encoded into a
        // side buffer because its length is needed for marker_body_size.
        let mut deletion_bytes: Vec<u8> = Vec::new();
        let marked_for_delete_at_delta = (deletion_time - self.stats.min_timestamp) as u64;
        encode_unsigned(marked_for_delete_at_delta, &mut deletion_bytes);
        let local_deletion_delta =
            (local_deletion_time as i64 - self.stats.min_local_deletion_time as i64) as u64;
        encode_unsigned(local_deletion_delta, &mut deletion_bytes);

        // marker_body_size covers the prev_size VInt + deletion times (same
        // convention as row_size for rows).
        let body_size = unsigned_len(prev_size) as u64 + deletion_bytes.len() as u64;
        encode_unsigned(body_size, &mut self.buffer);
        encode_unsigned(prev_size, &mut self.buffer);
        self.buffer.extend_from_slice(&deletion_bytes);

        let marker_end = self.buffer.len();
        Ok(marker_end - marker_start)
    }

    /// Current Data.db position (for Index.db offset tracking).
    ///
    /// Computed as the flushed byte count (`self.position`) plus the number of
    /// bytes currently held in the scratch buffer, i.e. the total number of
    /// Data.db bytes produced so far. Identical in both streaming and in-memory
    /// modes.
    pub fn position(&self) -> u64 {
        let buffered = self.buffer.len() as u64;
        self.position + buffered
    }

    /// Length in bytes of the per-partition scratch buffer (`self.buffer`).
    ///
    /// In streaming mode this reflects only the most recently written partition
    /// (the scratch is cleared after each flush), which is the basis of the
    /// bounded-memory guarantee.
    ///
    /// Test-only accessor: compiled only under `#[cfg(test)]`.
    #[cfg(test)]
    pub(crate) fn scratch_len(&self) -> usize {
        self.buffer.len()
    }

    /// Number of bytes already flushed to the streaming sink (`self.position`),
    /// excluding anything still held in the scratch buffer.
    ///
    /// Test-only accessor: compiled only under `#[cfg(test)]`.
    #[cfg(test)]
    pub(crate) fn flushed_position(&self) -> u64 {
        self.position
    }

    /// Collect the schema columns accepted by `predicate`, sorted by
    /// `column_order_key` (stable, so ties keep schema order).
    fn ordered_columns<'a, F>(&self, schema: &'a TableSchema, predicate: F) -> Vec<&'a Column>
    where
        F: Fn(&Column) -> bool,
    {
        let mut selected: Vec<&'a Column> = Vec::new();
        for candidate in schema.columns.iter() {
            if predicate(candidate) {
                selected.push(candidate);
            }
        }
        selected.sort_by_key(|candidate| column_order_key(candidate));
        selected
    }

    /// Return the mutation's operations ordered by the position of their column
    /// in `columns`.
    ///
    /// Operations on columns not found in `columns` sort after all known
    /// columns, and `DeleteRow` operations sort last. The sort is stable, so
    /// operations with the same rank keep their original relative order.
    fn sorted_operations<'a>(
        &self,
        mutation: &'a Mutation,
        columns: &[&Column],
    ) -> Vec<&'a crate::storage::write_engine::mutation::CellOperation> {
        // Column name -> serialization rank.
        let mut rank_by_name: std::collections::HashMap<&str, usize> =
            std::collections::HashMap::new();
        for (rank, column) in columns.iter().enumerate() {
            rank_by_name.insert(column.name.as_str(), rank);
        }

        let mut ordered: Vec<_> = mutation.operations.iter().collect();
        ordered.sort_by_key(|operation| {
            let column = match operation {
                crate::storage::write_engine::mutation::CellOperation::Write { column, .. }
                | crate::storage::write_engine::mutation::CellOperation::WriteWithTtl {
                    column,
                    ..
                }
                | crate::storage::write_engine::mutation::CellOperation::Delete { column } => {
                    column
                }
                // Row deletions carry no column and always go last.
                crate::storage::write_engine::mutation::CellOperation::DeleteRow => {
                    return usize::MAX
                }
            };
            // Unknown columns rank just before row deletions.
            rank_by_name
                .get(column.as_str())
                .copied()
                .unwrap_or(usize::MAX - 1)
        });
        ordered
    }

    /// Order merged ops by the position of their column in
    /// `self.regular_columns(schema)` (the regular-column serialization order).
    ///
    /// Ops on columns not found there sort after all known columns, and
    /// `DeleteRow` ops sort last. The sort is stable.
    fn sorted_merged_ops<'a, 'b>(
        &self,
        ops: &'b [MergedOp<'a>],
        schema: &TableSchema,
    ) -> Vec<&'b MergedOp<'a>> {
        let regular = self.regular_columns(schema);

        // Column name -> serialization rank.
        let mut rank_by_name: std::collections::HashMap<&str, usize> =
            std::collections::HashMap::new();
        for (rank, column) in regular.iter().enumerate() {
            rank_by_name.insert(column.name.as_str(), rank);
        }

        let mut ordered: Vec<&'b MergedOp<'a>> = ops.iter().collect();
        ordered.sort_by_key(|mop| {
            let column = match mop.op {
                crate::storage::write_engine::mutation::CellOperation::Write { column, .. }
                | crate::storage::write_engine::mutation::CellOperation::WriteWithTtl {
                    column,
                    ..
                }
                | crate::storage::write_engine::mutation::CellOperation::Delete { column } => {
                    column
                }
                // Row deletions carry no column and always go last.
                crate::storage::write_engine::mutation::CellOperation::DeleteRow => {
                    return usize::MAX
                }
            };
            // Unknown columns rank just before row deletions.
            rank_by_name
                .get(column.as_str())
                .copied()
                .unwrap_or(usize::MAX - 1)
        });
        ordered
    }

    /// Encode which of `columns` are present in a row.
    ///
    /// Three encodings are produced, all as unsigned VInts:
    /// - every column present: a single `0`;
    /// - fewer than 64 columns: a bitmap with one bit set per MISSING column
    ///   index;
    /// - 64 or more columns: the number of missing columns, followed by the
    ///   indices of the present columns when fewer than half are present, or
    ///   the indices of the missing columns otherwise.
    ///
    /// Names in `present_columns` that do not appear in `columns` are ignored.
    fn write_column_subset(
        &self,
        buf: &mut Vec<u8>,
        columns: &[&Column],
        present_columns: &std::collections::HashSet<&str>,
    ) -> Result<()> {
        let total = columns.len();

        // Split the column indices, preserving ascending order in both halves.
        let (present_indices, missing_indices): (Vec<usize>, Vec<usize>) = (0..total)
            .partition(|idx| present_columns.contains(columns[*idx].name.as_str()));

        // Nothing missing: a lone zero.
        if missing_indices.is_empty() {
            encode_unsigned(0, buf);
            return Ok(());
        }

        // Small superset: bitmap of missing columns.
        if total < 64 {
            let bitmap = missing_indices
                .iter()
                .fold(0u64, |bits, idx| bits | (1u64 << *idx));
            encode_unsigned(bitmap, buf);
            return Ok(());
        }

        // Large superset: missing count, then whichever index list is shorter.
        encode_unsigned(missing_indices.len() as u64, buf);

        let listed = if present_indices.len() < total / 2 {
            present_indices
        } else {
            missing_indices
        };
        for idx in listed {
            encode_unsigned(idx as u64, buf);
        }

        Ok(())
    }
}

/// Reports whether `data_type` names a non-frozen collection (a "complex" column).
///
/// Complex columns are stored as multiple cells with cell paths, whereas a
/// frozen collection is stored as a single cell with a blob value. The check is
/// a case-insensitive prefix match that understands both the CQL spelling
/// (`list<`, `set<`, `map<`, `frozen<`) and Cassandra's internal marshal class
/// names. It is intended to mirror the reader logic in `v5_compressed_legacy.rs`.
fn is_complex_column(data_type: &str) -> bool {
    // Frozen wrappers: never complex, whatever they wrap.
    const FROZEN_PREFIXES: [&str; 2] = [
        "frozen<",
        "org.apache.cassandra.db.marshal.frozentype(",
    ];
    // Multi-cell collections in CQL and Cassandra-internal notation.
    const COLLECTION_PREFIXES: [&str; 6] = [
        "list<",
        "set<",
        "map<",
        "org.apache.cassandra.db.marshal.listtype(",
        "org.apache.cassandra.db.marshal.settype(",
        "org.apache.cassandra.db.marshal.maptype(",
    ];

    let normalized = data_type.to_lowercase();
    let starts_with_any =
        |prefixes: &[&str]| prefixes.iter().any(|prefix| normalized.starts_with(*prefix));

    !starts_with_any(&FROZEN_PREFIXES) && starts_with_any(&COLLECTION_PREFIXES)
}

/// A surviving cell operation in a merged row, tagged with the timestamp and
/// row-level TTL of the mutation it came from.
///
/// The operation is borrowed (lifetime `'a`) rather than cloned.
struct MergedOp<'a> {
    /// The cell operation that survived the merge.
    op: &'a crate::storage::write_engine::mutation::CellOperation,
    /// Timestamp, in microseconds, carried over from the originating mutation.
    timestamp_micros: i64,
    /// Row-level TTL (`Mutation::ttl_seconds`) of the originating mutation.
    /// Per-cell TTL lives inside `CellOperation::WriteWithTtl` itself.
    row_ttl_seconds: Option<u32>,
}

/// One Data.db row assembled by merging every mutation of a partition that
/// shares the same clustering key (Issues #716/#717: a partition must never
/// contain two rows with equal clustering).
struct RowWrite<'a> {
    /// Clustering key shared by the merged mutations, borrowed for `'a`;
    /// `None` when the mutations carry no clustering key.
    clustering_key: Option<&'a crate::storage::write_engine::mutation::ClusteringKey>,
    /// Primary-key liveness timestamp. `None` for pure row tombstones —
    /// Cassandra serializes those without HAS_TIMESTAMP.
    liveness_ts: Option<i64>,
    /// Row-level TTL from the liveness-providing mutation.
    ttl_seconds: Option<u32>,
    /// Row deletion as (marked_for_delete_at µs, local_deletion_time s).
    row_deletion: Option<(i64, i32)>,
    /// Surviving cell operations (already reconciled, unsorted).
    ops: Vec<MergedOp<'a>>,
}

/// Sort key for a column: `(is_complex, name)`.
///
/// Because `false < true`, simple columns order before complex (non-frozen
/// collection) columns; ties are broken by column name.
fn column_order_key(column: &Column) -> (bool, &str) {
    let name = column.name.as_str();
    let complex = is_complex_column(&column.data_type);
    (complex, name)
}

/// Generate a version-1 TimeUUID for use as a list cell path.
///
/// List elements in Cassandra use TimeUUIDs as cell paths to maintain insertion order.
/// For a fixed `timestamp_micros`, a larger `element_index` yields a larger UUID
/// timestamp. The index is added in 100-ns units, so indices of 10 or more spill
/// into the following microsecond(s).
///
/// The output is fully deterministic: clock sequence 0, all-zero node, RFC 4122
/// variant (`10xx` in the top bits of octet 8) and version 1 (top nibble of octet 6).
///
/// Timestamp arithmetic is performed modulo 2^64, so negative (pre-1970)
/// timestamps and very large indices never panic; negative timestamps map to the
/// correct UUID time before the Unix epoch.
///
/// # Arguments
/// * `timestamp_micros` - Mutation timestamp in microseconds since Unix epoch
/// * `element_index` - Index of the element within the list (for monotonic ordering)
fn generate_list_cell_path_timeuuid(timestamp_micros: i64, element_index: u64) -> [u8; 16] {
    // UUID v1 timestamp: 100-nanosecond intervals since UUID epoch (Oct 15, 1582).
    // Offset from Unix epoch to UUID epoch in 100-ns units.
    const UUID_EPOCH_OFFSET: u64 = 0x01B2_1DD2_1381_4000;
    // Version nibble (1) positioned in time_hi_and_version.
    const VERSION_1: u16 = 0x1000;
    // clock_seq_hi_and_reserved | clock_seq_low: variant bits `10` in the two
    // most significant bits of octet 8, clock sequence = 0.
    const CLOCK_SEQ: u16 = 0x8000;
    // Fixed node for deterministic output.
    const NODE: [u8; 6] = [0x00; 6];

    // The `as u64` reinterpretation plus wrapping ops is two's-complement
    // arithmetic: it is exact whenever the true result fits in 64 bits and
    // avoids overflow panics for negative timestamps.
    let uuid_ts = (timestamp_micros as u64)
        .wrapping_mul(10)
        .wrapping_add(element_index)
        .wrapping_add(UUID_EPOCH_OFFSET);

    // Extract time fields per RFC 4122.
    let time_low = (uuid_ts & 0xFFFF_FFFF) as u32;
    let time_mid = ((uuid_ts >> 32) & 0xFFFF) as u16;
    let time_hi_and_version = ((uuid_ts >> 48) & 0x0FFF) as u16 | VERSION_1;

    let mut uuid = [0u8; 16];
    uuid[0..4].copy_from_slice(&time_low.to_be_bytes());
    uuid[4..6].copy_from_slice(&time_mid.to_be_bytes());
    uuid[6..8].copy_from_slice(&time_hi_and_version.to_be_bytes());
    uuid[8..10].copy_from_slice(&CLOCK_SEQ.to_be_bytes());
    uuid[10..16].copy_from_slice(&NODE);

    uuid
}

/// Convert a usize length to i32 for Cassandra's collection wire format.
/// Returns an error if the length exceeds i32::MAX.
fn len_as_i32(len: usize) -> Result<i32> {
    i32::try_from(len).map_err(|_| {
        Error::InvalidInput(format!(
            "Length {} exceeds maximum i32 for collection encoding",
            len
        ))
    })
}

/// Serialize a collection element, rejecting null (CQL semantics: lists/sets cannot contain null).
fn serialize_collection_element(value: &Value, collection_kind: &str) -> Result<Vec<u8>> {
    if matches!(value, Value::Null) {
        return Err(Error::InvalidInput(format!(
            "{} elements cannot be null (CQL semantics)",
            collection_kind
        )));
    }
    serialize_value(value)
}

/// Serialize a Value to bytes for cell storage
///
/// This follows Cassandra's type-specific serialization rules.
fn serialize_value(value: &Value) -> Result<Vec<u8>> {
    match value {
        Value::Null => Ok(Vec::new()),
        Value::Boolean(b) => Ok(vec![if *b { 1 } else { 0 }]),
        Value::TinyInt(n) => Ok(vec![*n as u8]),
        Value::SmallInt(n) => Ok(n.to_be_bytes().to_vec()),
        Value::Integer(n) => Ok(n.to_be_bytes().to_vec()),
        Value::BigInt(n) => Ok(n.to_be_bytes().to_vec()),
        Value::Counter(n) => Ok(n.to_be_bytes().to_vec()),
        Value::Float32(f) => Ok(f.to_bits().to_be_bytes().to_vec()),
        Value::Float(f) => Ok(f.to_bits().to_be_bytes().to_vec()),
        Value::Text(s) => Ok(s.as_bytes().to_vec()),
        Value::Blob(bytes) => Ok(bytes.clone()),
        Value::Timestamp(millis) => Ok(millis.to_be_bytes().to_vec()),
        Value::Date(days) => {
            // Cassandra DATE: stored as unsigned int with Integer.MIN_VALUE offset
            let stored = days.wrapping_sub(i32::MIN) as u32;
            Ok(stored.to_be_bytes().to_vec())
        }
        Value::Time(nanos) => Ok(nanos.to_be_bytes().to_vec()),
        Value::Uuid(bytes) => Ok(bytes.to_vec()),
        Value::Inet(bytes) => Ok(bytes.clone()),
        Value::Varint(bytes) => Ok(bytes.clone()),
        Value::Decimal { scale, unscaled } => {
            let mut result = Vec::new();
            result.extend_from_slice(&scale.to_be_bytes());
            result.extend_from_slice(unscaled);
            Ok(result)
        }
        Value::Duration {
            months,
            days,
            nanos,
        } => {
            let mut result = Vec::new();
            // Cassandra DurationType stores three signed VInts, not fixed-width ints.
            encode_signed(*months as i64, &mut result);
            encode_signed(*days as i64, &mut result);
            encode_signed(*nanos, &mut result);
            Ok(result)
        }
        Value::Udt(udt_value) => {
            // Construct UdtTypeDef from UdtValue fields by inferring types
            let mut schema =
                UdtTypeDef::new(udt_value.keyspace.clone(), udt_value.type_name.clone());

            // Infer field types from values
            for field in &udt_value.fields {
                let field_type = infer_cql_type_from_value(field.value.as_ref());
                schema = schema.with_field(field.name.clone(), field_type, true);
            }

            let serializer = TypeSerializer::new();
            serializer.serialize_udt(value, &schema)
        }
        Value::List(elements) | Value::Set(elements) => {
            let mut buf = Vec::new();
            buf.extend_from_slice(&len_as_i32(elements.len())?.to_be_bytes());
            for elem in elements {
                let elem_bytes = serialize_collection_element(elem, "Collection")?;
                buf.extend_from_slice(&len_as_i32(elem_bytes.len())?.to_be_bytes());
                buf.extend_from_slice(&elem_bytes);
            }
            Ok(buf)
        }
        Value::Map(entries) => {
            let mut buf = Vec::new();
            buf.extend_from_slice(&len_as_i32(entries.len())?.to_be_bytes());
            for (key, val) in entries {
                if matches!(key, Value::Null) {
                    return Err(Error::InvalidInput(
                        "MAP keys cannot be null (CQL semantics)".to_string(),
                    ));
                }
                let key_bytes = serialize_value(key)?;
                buf.extend_from_slice(&len_as_i32(key_bytes.len())?.to_be_bytes());
                buf.extend_from_slice(&key_bytes);
                let val_bytes = serialize_value(val)?;
                buf.extend_from_slice(&len_as_i32(val_bytes.len())?.to_be_bytes());
                buf.extend_from_slice(&val_bytes);
            }
            Ok(buf)
        }
        Value::Tuple(fields) => {
            let mut buf = Vec::new();
            for field in fields {
                match field {
                    Value::Null => buf.extend_from_slice(&(-1i32).to_be_bytes()),
                    other => {
                        let field_bytes = serialize_value(other)?;
                        buf.extend_from_slice(&len_as_i32(field_bytes.len())?.to_be_bytes());
                        buf.extend_from_slice(&field_bytes);
                    }
                }
            }
            Ok(buf)
        }
        Value::Frozen(inner) => serialize_value(inner),
        _ => Err(Error::InvalidInput(format!(
            "Unsupported value type for serialization: {:?}",
            value
        ))),
    }
}

/// Infer a CQL type from a `Value` instance.
///
/// Used for UDT serialization when schema context is not available.
/// Absent values, nulls, tombstones and JSON all map to `text`. Collection
/// element types are taken from the first element (or first entry for maps);
/// empty collections fall back to `text` because there is no element value
/// available to inspect. Tuples and UDTs are inferred field by field.
fn infer_cql_type_from_value(value: Option<&Value>) -> CqlType {
    // Nothing to look at: default to text.
    let Some(value) = value else {
        return CqlType::Text;
    };

    match value {
        // NULL defaults to text; tombstones shouldn't appear in UDT fields;
        // JSON is stored as text.
        Value::Null | Value::Text(_) | Value::Tombstone(_) | Value::Json(_) => CqlType::Text,

        Value::Boolean(_) => CqlType::Boolean,
        Value::TinyInt(_) => CqlType::TinyInt,
        Value::SmallInt(_) => CqlType::SmallInt,
        Value::Integer(_) => CqlType::Int,
        Value::BigInt(_) => CqlType::BigInt,
        Value::Float32(_) => CqlType::Float,
        Value::Float(_) => CqlType::Double,
        Value::Blob(_) => CqlType::Blob,
        Value::Timestamp(_) => CqlType::Timestamp,
        Value::Date(_) => CqlType::Date,
        Value::Time(_) => CqlType::Time,
        Value::Uuid(_) => CqlType::Uuid,
        Value::Inet(_) => CqlType::Inet,
        Value::Varint(_) => CqlType::Varint,
        Value::Decimal { .. } => CqlType::Decimal,
        Value::Duration { .. } => CqlType::Duration,
        Value::Counter(_) => CqlType::Counter,

        Value::List(elements) => {
            let element_type = elements
                .first()
                .map_or(CqlType::Text, |elem| infer_cql_type_from_value(Some(elem)));
            CqlType::List(Box::new(element_type))
        }
        Value::Set(elements) => {
            let element_type = elements
                .first()
                .map_or(CqlType::Text, |elem| infer_cql_type_from_value(Some(elem)));
            CqlType::Set(Box::new(element_type))
        }
        Value::Map(entries) => {
            let (key_type, value_type) = match entries.first() {
                Some((key, val)) => (
                    infer_cql_type_from_value(Some(key)),
                    infer_cql_type_from_value(Some(val)),
                ),
                None => (CqlType::Text, CqlType::Text),
            };
            CqlType::Map(Box::new(key_type), Box::new(value_type))
        }
        Value::Tuple(fields) => {
            let field_types = fields
                .iter()
                .map(|field| infer_cql_type_from_value(Some(field)))
                .collect();
            CqlType::Tuple(field_types)
        }
        Value::Udt(udt) => {
            let field_types = udt
                .fields
                .iter()
                .map(|field| {
                    (
                        field.name.clone(),
                        infer_cql_type_from_value(field.value.as_ref()),
                    )
                })
                .collect();
            CqlType::Udt(udt.type_name.clone(), field_types)
        }
        Value::Frozen(inner) => CqlType::Frozen(Box::new(infer_cql_type_from_value(Some(inner)))),
    }
}

/// Reports whether a cell holding `value` is written with a length prefix.
///
/// Returns `false` for boolean, int, bigint, float, double, timestamp and uuid
/// values; every other value kind returns `true`.
fn cell_value_uses_length_prefix(value: &Value) -> bool {
    match value {
        Value::Boolean(_)
        | Value::Integer(_)
        | Value::BigInt(_)
        | Value::Float32(_)
        | Value::Float(_)
        | Value::Timestamp(_)
        | Value::Uuid(_) => false,
        _ => true,
    }
}

/// Reports whether `mutation` writes only to the partition's static row.
///
/// That requires all of:
/// * the mutation has no clustering key;
/// * the schema declares at least one static column;
/// * every operation targets a column the schema marks static. `DeleteRow`
///   counts as static here, and a column missing from the schema does not.
///
/// A mutation with no operations satisfies the last condition vacuously.
fn is_static_row_mutation(mutation: &Mutation, schema: &TableSchema) -> bool {
    if mutation.clustering_key.is_some() {
        return false;
    }
    let table_has_static = schema.columns.iter().any(|column| column.is_static);
    if !table_has_static {
        return false;
    }

    for operation in &mutation.operations {
        let targets_static = match operation {
            crate::storage::write_engine::mutation::CellOperation::Write { column, .. }
            | crate::storage::write_engine::mutation::CellOperation::WriteWithTtl {
                column, ..
            }
            | crate::storage::write_engine::mutation::CellOperation::Delete { column } => {
                // First schema column with this name decides.
                schema
                    .columns
                    .iter()
                    .find(|candidate| candidate.name == *column)
                    .is_some_and(|candidate| candidate.is_static)
            }
            crate::storage::write_engine::mutation::CellOperation::DeleteRow => true,
        };
        if !targets_static {
            return false;
        }
    }

    true
}

/// Reports whether a single operation targets a static column.
///
/// `DeleteRow` never does. For column operations the first schema column with
/// a matching name decides; an unknown column is treated as non-static.
fn is_static_operation(
    op: &crate::storage::write_engine::mutation::CellOperation,
    schema: &TableSchema,
) -> bool {
    let column = match op {
        crate::storage::write_engine::mutation::CellOperation::Write { column, .. }
        | crate::storage::write_engine::mutation::CellOperation::WriteWithTtl { column, .. }
        | crate::storage::write_engine::mutation::CellOperation::Delete { column } => column,
        crate::storage::write_engine::mutation::CellOperation::DeleteRow => return false,
    };

    schema
        .columns
        .iter()
        .find(|candidate| candidate.name == *column)
        .is_some_and(|candidate| candidate.is_static)
}

/// Reports whether `mutation` contains at least one operation on a static column.
fn has_static_operation(mutation: &Mutation, schema: &TableSchema) -> bool {
    for op in &mutation.operations {
        if is_static_operation(op, schema) {
            return true;
        }
    }
    false
}

/// Collect and merge static-column operations from all mutations in a partition.
///
/// Scans every mutation (regardless of whether it has a clustering key) and
/// collects operations that target static columns.  Last-write-wins by
/// `timestamp_micros` when the same column is written more than once; on equal
/// timestamps the operation encountered later in `mutations` wins.
///
/// Mutations at or before `shadow_floor` (the partition tombstone's deletion
/// timestamp) are skipped: their static cells are shadowed and an sstable
/// must be internally reconciled (see `DataWriter::write_partition`).
///
/// Winners are tracked by reference while scanning, so only the surviving
/// operation per column is cloned into the result.
///
/// Returns the merged operations in an unspecified order (the writer will
/// sort them by schema column order when building the row body).
fn collect_static_operations(
    mutations: &[Mutation],
    schema: &TableSchema,
    shadow_floor: Option<i64>,
) -> Vec<crate::storage::write_engine::mutation::CellOperation> {
    use std::collections::HashMap;

    // Map: column_name → (timestamp, winning operation), all borrowed from `mutations`.
    let mut best: HashMap<&str, (i64, &crate::storage::write_engine::mutation::CellOperation)> =
        HashMap::new();

    for mutation in mutations {
        if shadow_floor.is_some_and(|floor| mutation.timestamp_micros <= floor) {
            continue;
        }
        for op in &mutation.operations {
            if !is_static_operation(op, schema) {
                continue;
            }
            let col_name = match op {
                crate::storage::write_engine::mutation::CellOperation::Write { column, .. }
                | crate::storage::write_engine::mutation::CellOperation::WriteWithTtl {
                    column,
                    ..
                }
                | crate::storage::write_engine::mutation::CellOperation::Delete { column } => {
                    column.as_str()
                }
                crate::storage::write_engine::mutation::CellOperation::DeleteRow => continue,
            };

            let candidate = (mutation.timestamp_micros, op);
            best.entry(col_name)
                .and_modify(|current| {
                    // `>=` keeps the later-listed operation on timestamp ties.
                    if candidate.0 >= current.0 {
                        *current = candidate;
                    }
                })
                .or_insert(candidate);
        }
    }

    best.into_values().map(|(_, op)| op.clone()).collect()
}

/// Reports whether the clustering range of `rt` contains `clustering_key`.
///
/// A missing clustering key is never covered. Keys are ordered with the
/// schema-aware `ClusteringKey::compare`; if that returns an error the plain
/// `Ord` comparison is used instead. A `Top` start bound or a `Bottom` end
/// bound makes the range cover nothing.
fn range_tombstone_covers(
    rt: &RangeTombstone,
    clustering_key: Option<&ClusteringKey>,
    schema: &TableSchema,
) -> bool {
    let ck = match clustering_key {
        Some(ck) => ck,
        None => return false,
    };

    // Ordering of the row's key relative to a bound.
    let order_vs = |bound: &ClusteringKey| match ck.compare(bound, schema) {
        Ok(ordering) => ordering,
        Err(_) => ck.cmp(bound),
    };

    let after_start = match &rt.start {
        ClusteringBound::Bottom => true,
        ClusteringBound::Top => false,
        ClusteringBound::Inclusive(bound) => order_vs(bound).is_ge(),
        ClusteringBound::Exclusive(bound) => order_vs(bound).is_gt(),
    };
    let before_end = match &rt.end {
        ClusteringBound::Top => true,
        ClusteringBound::Bottom => false,
        ClusteringBound::Inclusive(bound) => order_vs(bound).is_le(),
        ClusteringBound::Exclusive(bound) => order_vs(bound).is_lt(),
    };

    after_start && before_end
}

/// Serialize one clustering-key component (type-specific encoding).
///
/// * Fixed-width types (boolean, int, bigint, float, double, timestamp, uuid):
///   raw big-endian bytes, no length prefix.
/// * Variable-width types: unsigned-VInt length followed by the bytes. Besides
///   text, blob and frozen values this includes `tinyint`, `smallint` and
///   `date`: although their payloads are always 1, 2 and 4 bytes, they are
///   written length-prefixed, matching `cell_value_uses_length_prefix`, which
///   does not list them among the fixed-width kinds.
///
/// `Value::Frozen` is accepted with any comparator and serializes the inner
/// value with `serialize_value`.
///
/// # Errors
/// `Error::InvalidInput` when the value does not match the comparator or the
/// type is not supported as a clustering column; errors from `serialize_value`
/// are propagated for frozen values.
fn serialize_value_for_clustering(value: &Value, comparator: &ComparatorType) -> Result<Vec<u8>> {
    /// Returns `payload` preceded by its length as an unsigned VInt.
    fn with_vint_length(payload: &[u8]) -> Vec<u8> {
        let mut out = Vec::new();
        encode_unsigned(payload.len() as u64, &mut out);
        out.extend_from_slice(payload);
        out
    }

    match (value, comparator) {
        // Fixed-width types (no length prefix)
        (Value::Boolean(b), ComparatorType::Boolean) => Ok(vec![if *b { 1 } else { 0 }]),
        (Value::Integer(n), ComparatorType::Int) => Ok(n.to_be_bytes().to_vec()),
        (Value::BigInt(n), ComparatorType::BigInt) => Ok(n.to_be_bytes().to_vec()),
        (Value::Float32(f), ComparatorType::Float32) => Ok(f.to_bits().to_be_bytes().to_vec()),
        (Value::Float(f), ComparatorType::Float) => Ok(f.to_bits().to_be_bytes().to_vec()),
        (Value::Timestamp(millis), ComparatorType::Timestamp) => Ok(millis.to_be_bytes().to_vec()),
        (Value::Uuid(bytes), ComparatorType::Uuid) => Ok(bytes.to_vec()),

        // Constant-size payloads that are nevertheless written length-prefixed.
        (Value::TinyInt(n), ComparatorType::TinyInt) => Ok(with_vint_length(&n.to_be_bytes())),
        (Value::SmallInt(n), ComparatorType::SmallInt) => Ok(with_vint_length(&n.to_be_bytes())),
        (Value::Date(days), ComparatorType::Date) => {
            // Cassandra DATE in clustering keys: stored as unsigned int with Integer.MIN_VALUE offset
            let stored = days.wrapping_sub(i32::MIN) as u32;
            Ok(with_vint_length(&stored.to_be_bytes()))
        }

        // Variable-width types (VInt length + bytes)
        (Value::Text(s), ComparatorType::Text) => Ok(with_vint_length(s.as_bytes())),
        (Value::Blob(bytes), ComparatorType::Blob) => Ok(with_vint_length(bytes)),

        // Frozen collections as clustering keys: serialize the full collection bytes with VInt length prefix
        (Value::Frozen(inner), _) => {
            let bytes = serialize_value(inner)?;
            Ok(with_vint_length(&bytes))
        }

        _ => Err(Error::InvalidInput(format!(
            "Type mismatch or unsupported clustering type: value={:?}, comparator={:?}",
            value, comparator
        ))),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::schema::{
        ClusteringColumn, ClusteringOrder, Column, CqlType, KeyColumn, TableSchema,
    };
    use crate::storage::serialization::types::TypeSerializer;
    use crate::storage::write_engine::mutation::{
        CellOperation, ClusteringKey, PartitionKey, TableId,
    };
    use crate::types::UdtValue;
    use std::collections::HashMap;

    /// Fixture: `test_ks.test_table` with partition key `id int`, no clustering
    /// keys, and two regular columns (`name text`, `age int`).
    fn create_test_schema() -> TableSchema {
        // Every column in this fixture is nullable, non-static and has no default.
        let regular = |name: &str, data_type: &str| Column {
            name: name.to_string(),
            data_type: data_type.to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };
        let partition_key = KeyColumn {
            name: "id".to_string(),
            data_type: "int".to_string(),
            position: 0,
        };

        TableSchema {
            keyspace: "test_ks".to_string(),
            table: "test_table".to_string(),
            partition_keys: vec![partition_key],
            clustering_keys: vec![],
            columns: vec![regular("name", "text"), regular("age", "int")],
            comments: HashMap::new(),
        }
    }

    /// Fixture: statistics baseline with `min_timestamp = 1_000_000` and zero
    /// minimum TTL / local deletion time.
    fn create_test_stats() -> StatisticsMetadata {
        let mut baseline = StatisticsMetadata::new();
        baseline.min_local_deletion_time = 0;
        baseline.min_ttl = 0;
        baseline.min_timestamp = 1_000_000;
        baseline
    }

    /// Fixture: UDT definition `test_ks.address { street text, city text }`.
    fn phase3_address_schema() -> UdtTypeDef {
        let address = UdtTypeDef::new("test_ks".to_string(), "address".to_string());
        let address = address.with_field("street".to_string(), CqlType::Text, true);
        address.with_field("city".to_string(), CqlType::Text, true)
    }

    /// Fixture: UDT definition `test_ks.person` with a text `name`, a list of
    /// frozen `phone_number` UDTs and a frozen `address` UDT.
    fn phase3_person_schema() -> UdtTypeDef {
        let phone_numbers = CqlType::List(Box::new(CqlType::Frozen(Box::new(CqlType::Udt(
            "phone_number".to_string(),
            vec![],
        )))));
        let home_address = CqlType::Frozen(Box::new(CqlType::Udt("address".to_string(), vec![])));

        UdtTypeDef::new("test_ks".to_string(), "person".to_string())
            .with_field("name".to_string(), CqlType::Text, true)
            .with_field("phone_numbers".to_string(), phone_numbers, true)
            .with_field("home_address".to_string(), home_address, true)
    }

    /// Fixture: UDT definition `test_ks.company` with a text `name`, a list of
    /// frozen `person` UDTs, and a map from text to a frozen list of frozen
    /// `person` UDTs.
    fn phase3_company_schema() -> UdtTypeDef {
        // `frozen<person>` appears in both collection fields.
        let frozen_person =
            || CqlType::Frozen(Box::new(CqlType::Udt("person".to_string(), vec![])));

        let employees = CqlType::List(Box::new(frozen_person()));
        let departments = CqlType::Map(
            Box::new(CqlType::Text),
            Box::new(CqlType::Frozen(Box::new(CqlType::List(Box::new(
                frozen_person(),
            ))))),
        );

        UdtTypeDef::new("test_ks".to_string(), "company".to_string())
            .with_field("name".to_string(), CqlType::Text, true)
            .with_field("employees".to_string(), employees, true)
            .with_field("departments".to_string(), departments, true)
    }

    /// Fixture: an `address` UDT value ("Main St", "Seattle").
    fn phase3_address_value() -> UdtValue {
        let street = Value::Text("Main St".to_string());
        let city = Value::Text("Seattle".to_string());

        UdtValue::new("address".to_string(), "test_ks".to_string())
            .with_field("street".to_string(), Some(street))
            .with_field("city".to_string(), Some(city))
    }

    /// Fixture: a `phone_number` UDT value (label "mobile", number "+1-555-0101").
    fn phase3_phone_value() -> UdtValue {
        let label = Value::Text("mobile".to_string());
        let number = Value::Text("+1-555-0101".to_string());

        UdtValue::new("phone_number".to_string(), "test_ks".to_string())
            .with_field("label".to_string(), Some(label))
            .with_field("number".to_string(), Some(number))
    }

    /// Fixture: a `person` UDT value called `name`, with one frozen phone
    /// number in `phone_numbers` and a frozen `home_address`.
    fn phase3_person_value(name: &str) -> UdtValue {
        let phone_numbers = Value::List(vec![Value::Frozen(Box::new(Value::Udt(
            phase3_phone_value(),
        )))]);
        let home_address = Value::Frozen(Box::new(Value::Udt(phase3_address_value())));

        UdtValue::new("person".to_string(), "test_ks".to_string())
            .with_field("name".to_string(), Some(Value::Text(name.to_string())))
            .with_field("phone_numbers".to_string(), Some(phone_numbers))
            .with_field("home_address".to_string(), Some(home_address))
    }

    /// Fixture: a `company` UDT value "Acme" whose single employee "Alice" also
    /// forms the "platform" department.
    fn phase3_company_value() -> UdtValue {
        let alice = phase3_person_value("Alice");

        let employees = Value::List(vec![Value::Frozen(Box::new(Value::Udt(alice.clone())))]);
        let platform_team = Value::Frozen(Box::new(Value::List(vec![Value::Frozen(Box::new(
            Value::Udt(alice),
        ))])));
        let departments = Value::Map(vec![(Value::Text("platform".to_string()), platform_team)]);

        UdtValue::new("company".to_string(), "test_ks".to_string())
            .with_field("name".to_string(), Some(Value::Text("Acme".to_string())))
            .with_field("employees".to_string(), Some(employees))
            .with_field("departments".to_string(), Some(departments))
    }

    /// Fixture: `test_ks.test_table` with partition key `id int`, ascending
    /// clustering key `ck int`, a static text column `static_val` and a regular
    /// text column `regular_val`.
    fn create_static_test_schema() -> TableSchema {
        // Both columns are nullable text without a default; only `is_static` differs.
        let text_column = |name: &str, is_static: bool| Column {
            name: name.to_string(),
            data_type: "text".to_string(),
            nullable: true,
            default: None,
            is_static,
        };
        let partition_key = KeyColumn {
            name: "id".to_string(),
            data_type: "int".to_string(),
            position: 0,
        };
        let clustering_key = ClusteringColumn {
            name: "ck".to_string(),
            data_type: "int".to_string(),
            position: 0,
            order: ClusteringOrder::Asc,
        };

        TableSchema {
            keyspace: "test_ks".to_string(),
            table: "test_table".to_string(),
            partition_keys: vec![partition_key],
            clustering_keys: vec![clustering_key],
            columns: vec![
                text_column("static_val", true),
                text_column("regular_val", false),
            ],
            comments: HashMap::new(),
        }
    }

    /// A freshly constructed writer reports position 0.
    #[test]
    fn test_data_writer_new() {
        let writer = DataWriter::new(create_test_stats());
        assert_eq!(writer.position(), 0);
    }

    /// A live partition header is the u16 key length, the key bytes, and
    /// `DeletionTime.LIVE` (`i32::MAX` local deletion time, `i64::MIN` timestamp).
    #[test]
    fn test_write_partition_header() {
        let mut writer = DataWriter::new(create_test_stats());

        let key_bytes = vec![0x00, 0x00, 0x00, 0x2A]; // int = 42
        let key = DecoratedKey::new(12345, key_bytes);
        writer.write_partition_header(&key, None).unwrap();

        let bytes = writer.finish().unwrap();

        // Expected layout (Cassandra BigFormat):
        // [0x00, 0x04]              key length (u16 BE = 4 bytes)
        // [0x00, 0x00, 0x00, 0x2A]  key bytes
        // [0x7F, 0xFF, 0xFF, 0xFF]  DeletionTime.LIVE local_deletion_time (i32::MAX)
        // [0x80, 0x00...]           DeletionTime.LIVE deletion_timestamp (i64::MIN)
        let mut expected = vec![0x00, 0x04, 0x00, 0x00, 0x00, 0x2A];
        expected.extend_from_slice(&i32::MAX.to_be_bytes());
        expected.extend_from_slice(&i64::MIN.to_be_bytes());

        assert_eq!(&bytes[0..18], expected.as_slice());
    }

    /// Writing every schema column in one mutation produces a row whose flags
    /// carry both HAS_TIMESTAMP and HAS_ALL_COLUMNS.
    #[test]
    fn test_write_simple_row() {
        let schema = create_test_schema();
        let mut writer = DataWriter::new(create_test_stats());

        let operations = vec![
            CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Alice".to_string()),
            },
            CellOperation::Write {
                column: "age".to_string(),
                value: Value::Integer(30),
            },
        ];
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            operations,
            1001000, // timestamp (delta = 1000)
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // The row starts with its flags byte.
        let row_flags = bytes[0];
        assert_eq!(
            row_flags & ROW_HAS_TIMESTAMP,
            ROW_HAS_TIMESTAMP,
            "Should have timestamp"
        );
        assert_eq!(
            row_flags & ROW_HAS_ALL_COLUMNS,
            ROW_HAS_ALL_COLUMNS,
            "Should have all columns"
        );
    }

    /// A row with a timestamp clustering key is written successfully and its
    /// flags byte carries HAS_TIMESTAMP.
    #[test]
    fn test_write_row_with_clustering() {
        let schema = {
            let mut base = create_test_schema();
            base.clustering_keys = vec![ClusteringColumn {
                name: "ts".to_string(),
                data_type: "timestamp".to_string(),
                position: 0,
                order: ClusteringOrder::Asc,
            }];
            base
        };

        let mut writer = DataWriter::new(create_test_stats());

        let clustering = ClusteringKey::single("ts", Value::Timestamp(1234567890));
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            Some(clustering),
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Bob".to_string()),
            }],
            1001000,
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Only the flags byte is checked here; the clustering prefix follows it.
        let row_flags = bytes[0];
        assert_eq!(row_flags & ROW_HAS_TIMESTAMP, ROW_HAS_TIMESTAMP);
    }

    /// Writing a whole partition returns its start offset (0 for the first
    /// partition) and ends the output with the end-of-partition marker.
    #[test]
    fn test_write_partition_complete() {
        let schema = create_test_schema();
        let mut writer = DataWriter::new(create_test_stats());

        let key = DecoratedKey::new(12345, vec![0x00, 0x00, 0x00, 0x01]);
        let table_id = TableId::new("test_ks", "test_table");
        let pk = PartitionKey::single("id", Value::Integer(1));

        // Two writes to the same column of the same row, one millisecond apart.
        let first_write = Mutation::new(
            table_id.clone(),
            pk.clone(),
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Alice".to_string()),
            }],
            1001000,
            None,
        );
        let second_write = Mutation::new(
            table_id,
            pk,
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Bob".to_string()),
            }],
            1002000,
            None,
        );
        let mutations = vec![first_write, second_write];

        let offset = writer
            .write_partition(&key, &mutations, &schema, None, &[])
            .unwrap();
        assert_eq!(offset, 0); // First partition starts at offset 0

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Verify end-of-partition marker is present
        assert_eq!(bytes[bytes.len() - 1], END_OF_PARTITION);
    }

    /// Regression test for bug #644 (S6): row-header temporal deltas are
    /// written as *unsigned* VInts.
    ///
    /// Before the fix the writer used the ZigZag signed encoding
    /// (`encode_signed`) for the timestamp, TTL and LDT deltas.  ZigZag maps a
    /// positive n to 2n, so a delta of 5000 went out as 10000; the reader
    /// (fixed in S1, decoding with `parse_vuint`) then read back 10000 and
    /// every timestamp was doubled.
    ///
    /// Cassandra reference, `SerializationHeader.java:167`:
    ///   `out.writeUnsignedVInt(timestamp - stats.minTimestamp)`
    ///   `out.writeUnsignedVInt(ttl - stats.minTTL)`
    ///   `out.writeUnsignedVInt(localDeletionTime - stats.minLocalDeletionTime)`
    ///
    /// Expected 2-byte encodings (Cassandra format: leading 1-bits, then data):
    ///   5000 = 0x1388 → first = 0x80 | ((0x1388 >> 8) & 0x3F) = 0x93, second = 0x88
    ///     → [0x93, 0x88]; the ZigZag form (10000) would be [0xA7, 0x10]
    ///   3600 = 0x0E10 → first = 0x80 | ((0x0E10 >> 8) & 0x3F) = 0x8E, second = 0x10
    ///     → [0x8E, 0x10]; the ZigZag form (7200) would be [0x9C, 0x20]
    #[test]
    fn test_delta_encoding_unsigned_vint_fix_644() {
        // Baselines chosen so that both deltas need exactly two VInt bytes.
        let baseline = {
            let mut s = create_test_stats();
            s.min_timestamp = 1_000_000;
            s.min_ttl = 3_600;
            s.min_local_deletion_time = 0;
            s
        };

        let writer = DataWriter::new(baseline.clone());
        let schema = create_test_schema();

        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Test".to_string()),
            }],
            1_005_000,  // 5_000 above min_timestamp
            Some(7200), // 3_600 above min_ttl
        );

        let header_flags = ROW_HAS_TIMESTAMP | ROW_HAS_TTL;
        let row_body = writer
            .build_row_body(&mutation, &schema, header_flags)
            .unwrap();
        assert!(!row_body.is_empty(), "row body must be non-empty");

        // Body layout for HAS_TIMESTAMP | HAS_TTL:
        //   bytes 0..2  timestamp delta (unsigned VInt)
        //   bytes 2..4  ttl delta (unsigned VInt)
        //   bytes 4..   ldt delta (depends on the clock, so not asserted),
        //               followed by the column bitmap and the cells

        // Timestamp delta 5000: unsigned → [0x93, 0x88]; old ZigZag → [0xA7, 0x10].
        assert_eq!(
            &row_body[0..2],
            &[0x93u8, 0x88u8],
            "Fix #644: timestamp delta=5000 must encode as unsigned VInt [0x93, 0x88], \
             not ZigZag [0xA7, 0x10]. Reader uses parse_vuint (unsigned), so ZigZag would \
             double the delta on readback (5000 → decoded as 10000)."
        );

        // TTL delta 3600: unsigned → [0x8E, 0x10]; old ZigZag → [0x9C, 0x20].
        assert_eq!(
            &row_body[2..4],
            &[0x8Eu8, 0x10u8],
            "Fix #644: TTL delta=3600 must encode as unsigned VInt [0x8E, 0x10], \
             not ZigZag [0x9C, 0x20]. This is the first of two HAS_TTL fields."
        );
    }

    /// Smoke test: building a row body with timestamp and TTL flags against
    /// non-zero statistics baselines succeeds and produces some bytes.
    #[test]
    fn test_delta_encoding() {
        let baseline = {
            let mut s = create_test_stats();
            s.min_timestamp = 1000000;
            s.min_ttl = 3600;
            s
        };

        let writer = DataWriter::new(baseline.clone());
        let schema = create_test_schema();

        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Test".to_string()),
            }],
            1005000,    // 5000 above min_timestamp
            Some(7200), // 3600 above min_ttl
        );

        let header_flags = ROW_HAS_TIMESTAMP | ROW_HAS_TTL;
        let encoded = writer
            .build_row_body(&mutation, &schema, header_flags)
            .unwrap();
        assert!(!encoded.is_empty());
    }

    /// Table-driven check of `serialize_value` for a handful of scalar types.
    #[test]
    fn test_serialize_value_types() {
        let cases: Vec<(Value, Vec<u8>)> = vec![
            // Boolean: a single byte
            (Value::Boolean(true), vec![1]),
            // Integer: 4 bytes, big-endian
            (Value::Integer(42), vec![0x00, 0x00, 0x00, 0x2A]),
            // Text: the raw string bytes
            (Value::Text("hello".to_string()), b"hello".to_vec()),
            // BigInt: 8 bytes, big-endian
            (
                Value::BigInt(9223372036854775807),
                vec![0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF],
            ),
            // Null: no bytes at all
            (Value::Null, Vec::new()),
        ];

        for (value, expected) in cases {
            let encoded = serialize_value(&value).unwrap();
            assert_eq!(encoded, expected);
        }
    }

    /// A row that writes only `name` must produce a bitmap flagging `age` as
    /// the missing column.
    #[test]
    fn test_column_bitmap() {
        let baseline = create_test_stats();
        let writer = DataWriter::new(baseline);
        let schema = create_test_schema();

        // The schema's two regular columns are indexed alphabetically:
        // age = 0, name = 1. Only "name" is written here, so "age" is missing
        // and the mask has just bit 0 set (0b01).
        let only_name = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Alice".to_string()),
            }],
            1001000,
            None,
        );

        let mut bitmap = Vec::new();
        writer
            .write_column_bitmap(&mut bitmap, &only_name, &schema)
            .unwrap();

        // Cassandra format: one VUInt holding the missing-columns bitmask.
        assert_eq!(bitmap, vec![0x01]);
    }

    /// Partition keys are length-prefixed with a u16: a 256-byte key is
    /// accepted, a 65536-byte key is rejected with a "too large" error.
    #[test]
    fn test_partition_key_size_limit() {
        // Comfortably inside the u16 range (max 65535).
        let mut accepting_writer = DataWriter::new(create_test_stats());
        let small_key = DecoratedKey::new(12345, vec![0xFF; 256]);
        let accepted = accepting_writer.write_partition_header(&small_key, None);
        assert!(accepted.is_ok());

        // One byte past the u16 maximum.
        let mut rejecting_writer = DataWriter::new(create_test_stats());
        let oversized_key = DecoratedKey::new(12345, vec![0xFF; 65536]);
        let rejected = rejecting_writer.write_partition_header(&oversized_key, None);
        assert!(rejected.is_err());

        let message = rejected.unwrap_err().to_string();
        assert!(message.contains("too large"));
    }

    /// Encodes a single tombstone cell and inspects its flag byte.
    #[test]
    fn test_write_tombstone_cell() {
        let writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1000000;
            baseline.min_local_deletion_time = 1700000000; // mid-November 2023
            DataWriter::new(baseline)
        };

        let write_time = 1001000; // 1000 above min_timestamp
        let deleted_at = 1700000010; // 10 above min_local_deletion_time

        let mut cell = Vec::new();
        writer
            .write_tombstone_cell(&mut cell, "deleted_col", write_time, deleted_at)
            .unwrap();
        assert!(!cell.is_empty());

        // The flag byte leads the cell: IS_DELETED set, USE_ROW_TIMESTAMP clear.
        let cell_flags = cell[0];
        assert_eq!(
            cell_flags & CELL_IS_DELETED,
            CELL_IS_DELETED,
            "Should have IS_DELETED flag"
        );
        assert_eq!(
            cell_flags & CELL_USE_ROW_TIMESTAMP,
            0,
            "Should NOT have USE_ROW_TIMESTAMP flag"
        );

        // Something must follow the flag byte (the VInt-encoded deltas).
        assert!(
            cell.len() > 1,
            "Should have timestamp and deletion_time deltas"
        );
    }

    /// Fixed-width clustering values are emitted as bare big-endian bytes
    /// with no length prefix.
    #[test]
    fn test_serialize_clustering_value_fixed_width() {
        // Int → 4 bytes
        let int_encoded =
            serialize_value_for_clustering(&Value::Integer(42), &ComparatorType::Int).unwrap();
        assert_eq!(int_encoded, 42_i32.to_be_bytes().to_vec());

        // BigInt → 8 bytes
        let bigint_encoded =
            serialize_value_for_clustering(&Value::BigInt(1000), &ComparatorType::BigInt).unwrap();
        assert_eq!(bigint_encoded, 1000_i64.to_be_bytes().to_vec());
    }

    /// Variable-width clustering values carry a VInt length in front of the
    /// payload.
    #[test]
    fn test_serialize_clustering_value_variable_width() {
        let text = Value::Text("test".to_string());
        let encoded = serialize_value_for_clustering(&text, &ComparatorType::Text).unwrap();
        assert!(!encoded.is_empty());

        // A length of 4 fits in a single VInt byte (0x04); the payload follows.
        let length_prefix = encoded[0];
        assert_eq!(length_prefix, 0x04); // VInt length = 4
        let payload = &encoded[1..];
        assert_eq!(payload, b"test");
    }

    /// A `date` clustering value is written with a length prefix: one length
    /// byte followed by the 4-byte payload.
    #[test]
    fn test_serialize_clustering_date_includes_length_prefix() {
        let epoch_day = Value::Date(0);
        let encoded = serialize_value_for_clustering(&epoch_day, &ComparatorType::Date).unwrap();

        let length_prefix = encoded[0];
        assert_eq!(
            length_prefix, 0x04,
            "date clustering values should be length-prefixed"
        );

        let total_len = encoded.len();
        assert_eq!(
            total_len,
            5,
            "date clustering value should be 1-byte length + 4-byte payload"
        );
    }

    /// A frozen `list<text>` clustering value must equal the plain
    /// `serialize_value` encoding of the inner list, preceded by one length byte.
    #[test]
    fn test_serialize_clustering_frozen_list_text() {
        let frozen_list = Value::Frozen(Box::new(Value::List(vec![Value::Text(
            "solo".to_string(),
        )])));
        let frozen_list_type = ComparatorType::Frozen(Box::new(ComparatorType::List(Box::new(
            ComparatorType::Text,
        ))));

        let actual = serialize_value_for_clustering(&frozen_list, &frozen_list_type).unwrap();

        // Reference encoding of the unwrapped list.
        let inner_list = Value::List(vec![Value::Text("solo".to_string())]);
        let inner_encoded = serialize_value(&inner_list).unwrap();

        // Expected = [length byte] ++ inner encoding.
        let expected: Vec<u8> = std::iter::once(inner_encoded.len() as u8)
            .chain(inner_encoded.iter().copied())
            .collect();

        assert_eq!(actual, expected);
    }

    /// Distinguishes three cases in `write_cell`: NULL is rejected, an empty
    /// string is flagged with HAS_EMPTY_VALUE, and a non-empty string is not.
    #[test]
    fn test_null_vs_empty_string() {
        let writer = DataWriter::new(create_test_stats());

        // NULL: never written as a cell, the call must fail.
        let mut scratch = Vec::new();
        let null_outcome = writer.write_cell(&mut scratch, "test_col", &Value::Null, 1001000);
        assert!(null_outcome.is_err(), "NULL values should return error");
        let null_error = null_outcome.unwrap_err().to_string();
        assert!(null_error.contains("NULL values should not be written"));

        // Empty string: written, with the HAS_EMPTY_VALUE flag set.
        let empty_text = Value::Text(String::new());
        let mut empty_cell = Vec::new();
        writer
            .write_cell(&mut empty_cell, "test_col", &empty_text, 1001000)
            .unwrap();

        assert!(!empty_cell.is_empty());
        let empty_flags = empty_cell[0];
        assert_eq!(
            empty_flags & CELL_HAS_EMPTY_VALUE,
            CELL_HAS_EMPTY_VALUE,
            "Empty string should have HAS_EMPTY_VALUE flag"
        );

        // Non-empty string: the HAS_EMPTY_VALUE flag must stay clear.
        let some_text = Value::Text("test".to_string());
        let mut text_cell = Vec::new();
        writer
            .write_cell(&mut text_cell, "test_col", &some_text, 1001000)
            .unwrap();

        let text_flags = text_cell[0];
        assert_eq!(
            text_flags & CELL_HAS_EMPTY_VALUE,
            0,
            "Non-empty string should NOT have HAS_EMPTY_VALUE flag"
        );

        // The empty-string cell is exactly one byte: just the two flags.
        let flags_only = vec![CELL_USE_ROW_TIMESTAMP | CELL_HAS_EMPTY_VALUE];
        assert_eq!(empty_cell, flags_only);
    }

    /// An `Integer` cell is the flag byte followed directly by the 4 value
    /// bytes, with no length prefix in between.
    #[test]
    fn test_fixed_width_cell_omits_length_prefix() {
        let writer = DataWriter::new(create_test_stats());

        let mut cell = Vec::new();
        writer
            .write_cell(&mut cell, "value", &Value::Integer(42), 1001000)
            .unwrap();

        let expected = vec![CELL_USE_ROW_TIMESTAMP, 0x00, 0x00, 0x00, 0x2A];
        assert_eq!(cell, expected);
    }

    /// A `Text` cell is the flag byte, then a length byte, then the string
    /// bytes.
    #[test]
    fn test_variable_width_cell_keeps_length_prefix() {
        let writer = DataWriter::new(create_test_stats());

        let mut cell = Vec::new();
        let text = Value::Text("abc".to_string());
        writer.write_cell(&mut cell, "value", &text, 1001000).unwrap();

        // flags, length = 3, 'a', 'b', 'c'
        let expected = vec![CELL_USE_ROW_TIMESTAMP, 0x03, b'a', b'b', b'c'];
        assert_eq!(cell, expected);
    }

    /// Happy-path check for the value-length guard in `write_cell`.
    ///
    /// A value longer than i64::MAX bytes cannot be allocated in a test, so
    /// the rejecting branch is not exercised here; this only confirms that a
    /// reasonably sized (1000-byte) text value is accepted.
    #[test]
    fn test_value_length_bounds_check() {
        let writer = DataWriter::new(create_test_stats());

        let payload = Value::Text("x".repeat(1000));
        let mut cell = Vec::new();
        let outcome = writer.write_cell(&mut cell, "test_col", &payload, 1001000);

        assert!(outcome.is_ok(), "Reasonable-sized values should succeed");
    }

    /// A tombstone's local deletion time must not be below the statistics
    /// baseline: a time above it is accepted, one below it is rejected.
    #[test]
    fn test_tombstone_requires_deletion_time() {
        let writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1000000;
            baseline.min_local_deletion_time = 1700000000;
            DataWriter::new(baseline)
        };

        // Above the baseline → accepted.
        let above_min = 1700000010;
        let mut accepted_cell = Vec::new();
        let accepted =
            writer.write_tombstone_cell(&mut accepted_cell, "deleted_col", 1001000, above_min);
        assert!(accepted.is_ok(), "Valid deletion_time should succeed");

        // Below the baseline → rejected with a descriptive error.
        let below_min = 1600000000;
        let mut rejected_cell = Vec::new();
        let rejected =
            writer.write_tombstone_cell(&mut rejected_cell, "deleted_col", 1001000, below_min);
        assert!(rejected.is_err(), "deletion_time < min should fail");

        let message = rejected.unwrap_err().to_string();
        assert!(message.contains("less than min_local_deletion_time"));
    }

    /// A column written with an explicit NULL counts as missing in the
    /// column bitmap, exactly as if it had not been written at all.
    #[test]
    fn test_column_bitmap_skips_nulls() {
        let baseline = create_test_stats();
        let writer = DataWriter::new(baseline);
        let schema = create_test_schema();

        // Regular columns, alphabetical: age = 0, name = 1.
        //   "age"  written as NULL → treated as missing → bit 0 set
        //   "name" written with a value → present → bit 1 clear
        // Expected mask: 0b01 = 0x01.
        let name_write = CellOperation::Write {
            column: "name".to_string(),
            value: Value::Text("Alice".to_string()),
        };
        let null_age_write = CellOperation::Write {
            column: "age".to_string(),
            value: Value::Null,
        };
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![name_write, null_age_write],
            1001000,
            None,
        );

        let mut bitmap = Vec::new();
        writer
            .write_column_bitmap(&mut bitmap, &mutation, &schema)
            .unwrap();

        // Cassandra format: a single VUInt bitmask in which a set bit means MISSING.
        assert_eq!(
            bitmap,
            vec![0x01],
            "Bitmap should encode age as missing (bit 0)"
        );
    }

    /// A row in which one column is written as NULL must not advertise
    /// HAS_ALL_COLUMNS in its flag byte.
    #[test]
    fn test_row_with_null_values() {
        let baseline = create_test_stats();
        let mut writer = DataWriter::new(baseline);
        let schema = create_test_schema();

        let name_write = CellOperation::Write {
            column: "name".to_string(),
            value: Value::Text("Alice".to_string()),
        };
        let null_age_write = CellOperation::Write {
            column: "age".to_string(),
            value: Value::Null, // explicit NULL
        };
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![name_write, null_age_write],
            1001000,
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        let written = writer.finish().unwrap();
        assert!(!written.is_empty());

        // The row's flag byte is the first byte of the output.
        let row_flags = written[0];
        assert_eq!(
            row_flags & ROW_HAS_ALL_COLUMNS,
            0,
            "Row with NULL should NOT have HAS_ALL_COLUMNS flag"
        );
    }

    /// Writes two single-row partitions back to back and checks that the
    /// reported start offsets increase and that *each* partition is closed by
    /// an end-of-partition marker.
    #[test]
    fn test_multiple_partitions() {
        let stats = create_test_stats();
        let mut writer = DataWriter::new(stats);
        let schema = create_test_schema();

        // Write first partition
        let key1 = DecoratedKey::new(100, vec![0x00, 0x00, 0x00, 0x01]);
        let table_id = TableId::new("test_ks", "test_table");
        let pk1 = PartitionKey::single("id", Value::Integer(1));
        let mutations1 = vec![Mutation::new(
            table_id.clone(),
            pk1,
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Alice".to_string()),
            }],
            1001000,
            None,
        )];

        let offset1 = writer
            .write_partition(&key1, &mutations1, &schema, None, &[])
            .unwrap();
        assert_eq!(offset1, 0);

        // Write second partition
        let key2 = DecoratedKey::new(200, vec![0x00, 0x00, 0x00, 0x02]);
        let pk2 = PartitionKey::single("id", Value::Integer(2));
        let mutations2 = vec![Mutation::new(
            table_id,
            pk2,
            None,
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Bob".to_string()),
            }],
            1002000,
            None,
        )];

        let offset2 = writer
            .write_partition(&key2, &mutations2, &schema, None, &[])
            .unwrap();
        assert!(
            offset2 > offset1,
            "Second partition should start after first"
        );

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Both partitions should have end-of-partition markers.
        // END_OF_PARTITION (0x01) may appear elsewhere (e.g., in cell flags),
        // so scanning for the value proves nothing. Instead check the two
        // positions where a marker must sit: the byte immediately before the
        // second partition's start offset, and the final byte of the output.
        let second_start = offset2 as usize;
        assert!(
            second_start < bytes.len(),
            "Second partition offset must lie inside the written data"
        );
        // `second_start - 1` cannot underflow: offset2 > offset1 == 0 was asserted above.
        assert_eq!(
            bytes[second_start - 1],
            END_OF_PARTITION,
            "First partition should end with END_OF_PARTITION"
        );

        // The last byte should be an END_OF_PARTITION marker
        assert_eq!(
            bytes[bytes.len() - 1],
            END_OF_PARTITION,
            "File should end with END_OF_PARTITION"
        );
    }

    // ========== M5.2 Tombstone Tests ==========

    /// Writes a pure row deletion (`DeleteRow` only) and checks the row flags
    /// and that the columns subset closes the row body (Issue #717).
    #[test]
    fn test_row_tombstone() {
        let stats = create_test_stats();
        let mut writer = DataWriter::new(stats);
        let schema = create_test_schema();

        let table_id = TableId::new("test_ks", "test_table");
        let pk = PartitionKey::single("id", Value::Integer(1));
        let mutation = Mutation::new(
            table_id,
            pk,
            None,
            vec![CellOperation::DeleteRow],
            1001000,
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Verify row flags have HAS_DELETION
        let flags = bytes[0];
        assert_eq!(
            flags & ROW_HAS_DELETION,
            ROW_HAS_DELETION,
            "Should have HAS_DELETION flag"
        );
        // Issue #717: a pure row tombstone carries no primary-key liveness —
        // Cassandra serializes DELETE-d rows without HAS_TIMESTAMP.
        assert_eq!(
            flags & ROW_HAS_TIMESTAMP,
            0,
            "Pure row tombstone must not have HAS_TIMESTAMP"
        );
        assert_eq!(
            flags & ROW_HAS_ALL_COLUMNS,
            0,
            "Row tombstone must not claim all columns"
        );

        // Issue #717: the columns subset must follow the deletion times.
        // Layout: [flags][row_size][prev_size=0][deletion mfda][deletion ldt][subset]
        // With create_test_stats baselines both deletion deltas and the
        // all-missing subset are single-byte VInts.
        //
        // The arithmetic below reads bytes[1] as the complete row_size VInt.
        // That is only valid for a one-byte encoding, which has its high bit
        // clear (extra bytes are signalled by leading 1-bits). Check it
        // explicitly so a layout change fails with a message, not an index panic.
        assert!(
            bytes.len() >= 2,
            "Row must contain at least flags and row_size (got {} bytes)",
            bytes.len()
        );
        assert_eq!(
            bytes[1] & 0x80,
            0,
            "row_size must be a single-byte VInt for this layout check (first byte {:#04x})",
            bytes[1]
        );
        let row_size = bytes[1] as usize;
        // Body = prev_size(1) + mfda(vint) + ldt(vint) + subset(vint ≥ 1 byte)
        assert!(
            row_size >= 4,
            "Row tombstone body must include the columns subset (got row_size={})",
            row_size
        );
        // The final body byte is the all-missing subset bitmask. The schema
        // from create_test_schema has 2 regular columns (the sibling bitmap
        // tests list them as age, name), so every-column-missing is 0b11.
        let body_end = 2 + row_size; // flags + row_size byte + body
        assert!(
            bytes.len() >= body_end,
            "Row body is truncated: row_size={} but only {} bytes were written",
            row_size,
            bytes.len()
        );
        assert_eq!(
            bytes[body_end - 1],
            0b11,
            "Columns subset must mark every regular column missing"
        );
    }

    /// Writes a partition header carrying a partition-level tombstone and
    /// decodes the fixed-width deletion fields that follow the key.
    #[test]
    fn test_partition_tombstone() {
        use crate::storage::write_engine::mutation::PartitionTombstone;

        let mut writer = DataWriter::new(create_test_stats());

        let key_bytes = vec![0x00, 0x00, 0x00, 0x2A];
        let key_len = key_bytes.len();
        let key = DecoratedKey::new(12345, key_bytes);
        let tombstone = PartitionTombstone {
            deletion_time: 1001000,          // microseconds
            local_deletion_time: 1700000010, // seconds
        };

        writer
            .write_partition_header(&key, Some(&tombstone))
            .unwrap();

        let bytes = writer.finish().unwrap();

        // Header layout (Cassandra BigFormat):
        //   u16 BE key length | key bytes | i32 BE local_deletion_time | i64 BE deletion timestamp
        assert_eq!(&bytes[0..2], &[0x00, 0x04], "Key length (u16 BE)");

        // local_deletion_time sits right after the 2-byte length and the key (offset 6).
        let ldt_at = 2 + key_len;
        let mut ldt_raw = [0u8; 4];
        ldt_raw.copy_from_slice(&bytes[ldt_at..ldt_at + 4]);
        let ldt = i32::from_be_bytes(ldt_raw);
        assert_eq!(ldt, 1700000010, "Local deletion time should match");

        // The deletion timestamp follows immediately (offset 10).
        let ts_at = ldt_at + 4;
        let mut ts_raw = [0u8; 8];
        ts_raw.copy_from_slice(&bytes[ts_at..ts_at + 8]);
        let ts = i64::from_be_bytes(ts_raw);
        assert_eq!(ts, 1001000, "Deletion timestamp should match");
    }

    /// Writes an inclusive start/end range-tombstone marker pair and checks
    /// the flag byte, bound kind and clustering count of the emitted markers.
    #[test]
    fn test_range_tombstone_inclusive_bounds() {
        use crate::storage::write_engine::mutation::{ClusteringBound, RangeTombstone};

        // The shared test schema, extended with one ascending `ts` clustering column.
        let schema = {
            let mut with_clustering = create_test_schema();
            with_clustering.clustering_keys = vec![ClusteringColumn {
                name: "ts".to_string(),
                data_type: "timestamp".to_string(),
                position: 0,
                order: ClusteringOrder::Asc,
            }];
            with_clustering
        };

        let mut writer = DataWriter::new(create_test_stats());

        let tombstone = RangeTombstone {
            start: ClusteringBound::Inclusive(ClusteringKey::single("ts", Value::Timestamp(1000))),
            end: ClusteringBound::Inclusive(ClusteringKey::single("ts", Value::Timestamp(2000))),
            deletion_time: 1001000,
            local_deletion_time: 1700000010,
        };
        let (lower, upper) = (&tombstone.start, &tombstone.end);
        let deleted_at = tombstone.deletion_time;
        let local_deleted_at = tombstone.local_deletion_time;

        // Opening marker first; its returned size is handed to the closing
        // marker's write as the last argument.
        let open_size = writer
            .write_range_bound(lower, true, deleted_at, local_deleted_at, &schema, 0)
            .unwrap();
        writer
            .write_range_bound(
                upper,
                false,
                deleted_at,
                local_deleted_at,
                &schema,
                open_size as u64,
            )
            .unwrap();

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Opening bound, using Cassandra ClusteringPrefix.Kind ordinals.
        assert_eq!(bytes[0], IS_MARKER, "Should have IS_MARKER flag");
        assert_eq!(
            bytes[1], INCL_START_BOUND,
            "Should have INCL_START_BOUND kind (ordinal 1)"
        );
        // The kind byte is followed by a u16 BE count of clustering values.
        let value_count = u16::from_be_bytes([bytes[2], bytes[3]]);
        assert_eq!(value_count, 1, "Bound carries one clustering value");

        // The closing marker begins right where the opening one ends.
        let close_at = open_size;
        assert_eq!(bytes[close_at], IS_MARKER);
        assert_eq!(
            bytes[close_at + 1],
            INCL_END_BOUND,
            "Should have INCL_END_BOUND kind (ordinal 6)"
        );
    }

    /// Writes an exclusive start/end range-tombstone marker pair and checks
    /// the flag byte and bound kind of both emitted markers.
    #[test]
    fn test_range_tombstone_exclusive_bounds() {
        use crate::storage::write_engine::mutation::{ClusteringBound, RangeTombstone};

        // The shared test schema, extended with one ascending `ts` clustering column.
        let mut schema = create_test_schema();
        schema.clustering_keys = vec![ClusteringColumn {
            name: "ts".to_string(),
            data_type: "timestamp".to_string(),
            position: 0,
            order: ClusteringOrder::Asc,
        }];

        let stats = create_test_stats();
        let mut writer = DataWriter::new(stats);

        let range = RangeTombstone {
            start: ClusteringBound::Exclusive(ClusteringKey::single("ts", Value::Timestamp(1000))),
            end: ClusteringBound::Exclusive(ClusteringKey::single("ts", Value::Timestamp(2000))),
            deletion_time: 1001000,
            local_deletion_time: 1700000010,
        };

        // Opening marker first; its returned size locates the closing marker
        // in the output and is passed on as the last argument of the second write.
        let open_size = writer
            .write_range_bound(
                &range.start,
                true,
                range.deletion_time,
                range.local_deletion_time,
                &schema,
                0,
            )
            .unwrap();
        writer
            .write_range_bound(
                &range.end,
                false,
                range.deletion_time,
                range.local_deletion_time,
                &schema,
                open_size as u64,
            )
            .unwrap();

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Verify opening bound: Cassandra ClusteringPrefix.Kind ordinals
        assert_eq!(bytes[0], IS_MARKER, "Should have IS_MARKER flag");
        assert_eq!(
            bytes[1], EXCL_START_BOUND,
            "Should have EXCL_START_BOUND kind (ordinal 7)"
        );

        // Closing bound starts right after the opening marker. Check its flag
        // byte as well as its kind, matching the inclusive-bounds test.
        assert_eq!(
            bytes[open_size], IS_MARKER,
            "Closing bound should have IS_MARKER flag"
        );
        assert_eq!(
            bytes[open_size + 1],
            EXCL_END_BOUND,
            "Should have EXCL_END_BOUND kind (ordinal 0)"
        );
    }

    /// Writes a Bottom..Top range tombstone (the whole partition) and checks
    /// that both markers are emitted as inclusive bounds carrying zero
    /// clustering values.
    #[test]
    fn test_range_tombstone_bottom_top_bounds() {
        use crate::storage::write_engine::mutation::{ClusteringBound, RangeTombstone};

        // The shared test schema, extended with one ascending `ts` clustering column.
        let mut schema = create_test_schema();
        schema.clustering_keys = vec![ClusteringColumn {
            name: "ts".to_string(),
            data_type: "timestamp".to_string(),
            position: 0,
            order: ClusteringOrder::Asc,
        }];

        let stats = create_test_stats();
        let mut writer = DataWriter::new(stats);

        // Delete everything from start to end of partition
        let range = RangeTombstone {
            start: ClusteringBound::Bottom,
            end: ClusteringBound::Top,
            deletion_time: 1001000,
            local_deletion_time: 1700000010,
        };

        // Opening marker first; its returned size locates the closing marker
        // in the output and is passed on as the last argument of the second write.
        let open_size = writer
            .write_range_bound(
                &range.start,
                true,
                range.deletion_time,
                range.local_deletion_time,
                &schema,
                0,
            )
            .unwrap();
        writer
            .write_range_bound(
                &range.end,
                false,
                range.deletion_time,
                range.local_deletion_time,
                &schema,
                open_size as u64,
            )
            .unwrap();

        let bytes = writer.finish().unwrap();
        assert!(!bytes.is_empty());

        // Bottom serializes as an inclusive start bound with zero clustering
        // values (u16 count = 0, no clustering header byte).
        assert_eq!(bytes[0], IS_MARKER, "Should have IS_MARKER flag");
        assert_eq!(
            bytes[1], INCL_START_BOUND,
            "Bottom should serialize as INCL_START_BOUND"
        );
        assert_eq!(
            u16::from_be_bytes([bytes[2], bytes[3]]),
            0,
            "Bottom carries no clustering values"
        );

        // Top serializes as an inclusive end bound with zero values. Check the
        // flag byte and the u16 count too, mirroring the Bottom assertions,
        // so the "zero values" claim is actually verified.
        assert_eq!(
            bytes[open_size], IS_MARKER,
            "Closing bound should have IS_MARKER flag"
        );
        assert_eq!(bytes[open_size + 1], INCL_END_BOUND);
        assert_eq!(
            u16::from_be_bytes([bytes[open_size + 2], bytes[open_size + 3]]),
            0,
            "Top carries no clustering values"
        );
    }

    /// Writes a partition containing one clustered row plus a range
    /// tombstone that covers it, and checks the start offset and the
    /// partition header's key length.
    #[test]
    fn test_complete_partition_with_range_tombstone() {
        use crate::storage::write_engine::mutation::{ClusteringBound, RangeTombstone};

        // The shared test schema, extended with one ascending `ts` clustering column.
        let schema = {
            let mut with_clustering = create_test_schema();
            with_clustering.clustering_keys = vec![ClusteringColumn {
                name: "ts".to_string(),
                data_type: "timestamp".to_string(),
                position: 0,
                order: ClusteringOrder::Asc,
            }];
            with_clustering
        };

        let mut writer = DataWriter::new(create_test_stats());

        let key = DecoratedKey::new(12345, vec![0x00, 0x00, 0x00, 0x01]);

        // One row at clustering ts = 1000.
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            Some(ClusteringKey::single("ts", Value::Timestamp(1000))),
            vec![CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text("Alice".to_string()),
            }],
            1001000,
            None,
        );
        let mutations = vec![row];

        // Range [500, 1500] encloses the row; its deletion time is later than
        // the row's timestamp, so the tombstone shadows the row.
        let covering = RangeTombstone {
            start: ClusteringBound::Inclusive(ClusteringKey::single("ts", Value::Timestamp(500))),
            end: ClusteringBound::Inclusive(ClusteringKey::single("ts", Value::Timestamp(1500))),
            deletion_time: 1002000,
            local_deletion_time: 1700000020,
        };
        let range_tombstones = vec![covering];

        let start_offset = writer
            .write_partition(&key, &mutations, &schema, None, &range_tombstones)
            .unwrap();
        assert_eq!(start_offset, 0);

        let written = writer.finish().unwrap();
        assert!(!written.is_empty());

        // The partition header opens with the key length as a u16 BE.
        assert_eq!(&written[0..2], &[0x00, 0x04], "Key length (u16 BE)");

        // NOTE: range tombstone markers are expected to precede the rows, but
        // this test does not assert their position in the output.
    }

    /// Encodes one expiring (TTL) cell and inspects its flag byte and
    /// overall size.
    #[test]
    fn test_write_cell_with_ttl() {
        let writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1000000;
            baseline.min_local_deletion_time = 1700000000;
            baseline.min_ttl = 3600;
            DataWriter::new(baseline)
        };

        let write_time = 1001000;
        let ttl = 7200;
        let payload = Value::Text("test".to_string());

        let mut cell = Vec::new();
        writer
            .write_cell_with_ttl(&mut cell, "test_col", &payload, write_time, ttl)
            .unwrap();

        assert!(!cell.is_empty());

        // The flag byte leads the cell: IS_EXPIRING set, and neither of the
        // "inherit from row" flags, so the cell carries its own timestamp and TTL.
        let cell_flags = cell[0];
        assert_eq!(
            cell_flags & CELL_IS_EXPIRING,
            CELL_IS_EXPIRING,
            "Should have IS_EXPIRING flag"
        );
        assert_eq!(
            cell_flags & CELL_USE_ROW_TIMESTAMP,
            0,
            "Should NOT have USE_ROW_TIMESTAMP flag"
        );
        assert_eq!(
            cell_flags & CELL_USE_ROW_TTL,
            0,
            "Should NOT have USE_ROW_TTL flag"
        );

        // Timestamp delta, local_deletion_time delta, TTL delta and the value
        // together must exceed 10 bytes.
        assert!(cell.len() > 10, "Should have all TTL cell fields");
    }

    /// A row mixing one TTL cell with one plain cell still advertises a row
    /// timestamp and a complete column set in its flags byte.
    #[test]
    fn test_row_with_ttl_cells() {
        let schema = create_test_schema();
        let mut writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1_000_000;
            baseline.min_local_deletion_time = 1_700_000_000;
            baseline.min_ttl = 3_600;
            DataWriter::new(baseline)
        };

        let cells = vec![
            CellOperation::WriteWithTtl {
                column: String::from("name"),
                value: Value::Text(String::from("Alice")),
                ttl_seconds: 7200,
            },
            CellOperation::Write {
                column: String::from("age"),
                value: Value::Integer(30),
            },
        ];
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            cells,
            1_001_000,
            None,
        );

        writer.write_row(&row, &schema).unwrap();

        let encoded = writer.finish().unwrap();
        assert!(!encoded.is_empty());

        // The row flags byte is the first byte of the output.
        let row_flags = encoded[0];
        assert_eq!(
            row_flags & ROW_HAS_TIMESTAMP,
            ROW_HAS_TIMESTAMP,
            "Should have timestamp"
        );
        assert_eq!(
            row_flags & ROW_HAS_ALL_COLUMNS,
            ROW_HAS_ALL_COLUMNS,
            "Should have all columns"
        );
    }

    /// Smoke test: a row whose two cells carry different TTLs encodes without
    /// error and yields output. The per-cell TTLs are not decoded here.
    #[test]
    fn test_row_with_multiple_ttl_cells() {
        let schema = create_test_schema();
        let mut writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1_000_000;
            baseline.min_local_deletion_time = 1_700_000_000;
            baseline.min_ttl = 1_800;
            DataWriter::new(baseline)
        };

        let one_hour_cell = CellOperation::WriteWithTtl {
            column: String::from("name"),
            value: Value::Text(String::from("Alice")),
            ttl_seconds: 3600,
        };
        // Deliberately a different TTL from the first cell.
        let two_hour_cell = CellOperation::WriteWithTtl {
            column: String::from("age"),
            value: Value::Integer(30),
            ttl_seconds: 7200,
        };
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![one_hour_cell, two_hour_cell],
            1_001_000,
            None,
        );

        writer.write_row(&row, &schema).unwrap();

        let encoded = writer.finish().unwrap();
        assert!(!encoded.is_empty());
    }

    /// A plain cell followed by a TTL cell in the same row still produces a
    /// row that carries the row-timestamp flag.
    #[test]
    fn test_mixed_ttl_and_regular_cells() {
        let schema = create_test_schema();
        let mut writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1_000_000;
            baseline.min_local_deletion_time = 1_700_000_000;
            baseline.min_ttl = 3_600;
            DataWriter::new(baseline)
        };

        let plain_cell = CellOperation::Write {
            column: String::from("name"),
            value: Value::Text(String::from("Alice")),
        };
        let expiring_cell = CellOperation::WriteWithTtl {
            column: String::from("age"),
            value: Value::Integer(30),
            ttl_seconds: 7200,
        };
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![plain_cell, expiring_cell],
            1_001_000,
            None,
        );

        writer.write_row(&row, &schema).unwrap();

        let encoded = writer.finish().unwrap();
        assert!(!encoded.is_empty());

        // First output byte holds the row flags.
        let row_flags = encoded[0];
        assert_eq!(row_flags & ROW_HAS_TIMESTAMP, ROW_HAS_TIMESTAMP);
    }

    /// A TTL of zero (immediate expiration) must still be encoded as an
    /// expiring cell.
    #[test]
    fn test_ttl_zero_special_case() {
        let writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1_000_000;
            baseline.min_local_deletion_time = 1_700_000_000;
            baseline.min_ttl = 0;
            DataWriter::new(baseline)
        };

        let payload = Value::Text(String::from("test"));
        let mut encoded = Vec::new();
        writer
            .write_cell_with_ttl(&mut encoded, "test_col", &payload, 1_001_000, 0)
            .unwrap();

        assert!(!encoded.is_empty());

        let cell_flags = encoded[0];
        assert_eq!(cell_flags & CELL_IS_EXPIRING, CELL_IS_EXPIRING);
    }

    /// `update_ttl` tracks the smallest and largest TTL observed; the trailing
    /// zero must not drag `min_ttl` down.
    #[test]
    fn test_ttl_statistics_tracking() {
        let mut stats = StatisticsMetadata::new();

        // The final 0 is expected to be ignored by the tracker.
        for ttl in [3600, 7200, 1800, 0] {
            stats.update_ttl(ttl);
        }

        assert_eq!(stats.min_ttl, 1800, "min_ttl should be 1800");
        assert_eq!(stats.max_ttl, 7200, "max_ttl should be 7200");
    }

    /// Writing a NULL value as a TTL cell is rejected with a descriptive error.
    #[test]
    fn test_ttl_cell_with_null_value() {
        let writer = DataWriter::new(create_test_stats());

        let mut sink = Vec::new();
        let outcome =
            writer.write_cell_with_ttl(&mut sink, "test_col", &Value::Null, 1_001_000, 3_600);

        assert!(outcome.is_err(), "NULL values should return error");

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("NULL values should not be written"));
    }

    /// Smoke test for the local-deletion-time path of a TTL cell.
    ///
    /// Only checks that encoding succeeds and produces output; the encoded
    /// local_deletion_time delta itself is not decoded or verified here.
    #[test]
    fn test_ttl_cell_local_deletion_time_calculation() {
        let writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1_000_000;
            baseline.min_local_deletion_time = 1_700_000_000;
            baseline.min_ttl = 3_600;
            DataWriter::new(baseline)
        };

        let payload = Value::Text(String::from("test"));
        let mut encoded = Vec::new();
        // TTL of two hours, written at timestamp 1_001_000.
        writer
            .write_cell_with_ttl(&mut encoded, "test_col", &payload, 1_001_000, 7_200)
            .unwrap();

        assert!(!encoded.is_empty());
    }

    /// With a row-level TTL, every cell should be flagged as expiring while
    /// inheriting both the row timestamp and the row TTL.
    #[test]
    fn test_row_ttl_uses_row_ttl_cell_flags() {
        let schema = create_test_schema();
        let mut writer = {
            let mut baseline = create_test_stats();
            baseline.min_timestamp = 1_001_000;
            baseline.min_ttl = 7_200;
            baseline.min_local_deletion_time = 1;
            DataWriter::new(baseline)
        };

        let cells = vec![
            CellOperation::Write {
                column: String::from("name"),
                value: Value::Text(String::from("Alice")),
            },
            CellOperation::Write {
                column: String::from("age"),
                value: Value::Integer(30),
            },
        ];
        // The trailing `Some(7200)` is the row-level TTL.
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            cells,
            1_001_000,
            Some(7200),
        );

        writer.write_row(&row, &schema).unwrap();
        let encoded = writer.finish().unwrap();

        assert_eq!(encoded[0] & ROW_HAS_TTL, ROW_HAS_TTL);

        // The scan below covers the whole buffer, so it relies on no other
        // byte of the encoded row happening to equal this flag combination.
        let inherited_flags = CELL_IS_EXPIRING | CELL_USE_ROW_TIMESTAMP | CELL_USE_ROW_TTL;
        let inheriting_cells = encoded
            .iter()
            .filter(|byte| **byte == inherited_flags)
            .count();
        assert_eq!(inheriting_cells, 2, "expected both cells to inherit row TTL");
    }

    /// The first row after the partition header must be the static row,
    /// identified by the extended-flags byte.
    #[test]
    fn test_write_partition_emits_static_row_before_regular_rows() {
        let schema = create_static_test_schema();
        let mut writer = DataWriter::new(create_test_stats());
        let key = DecoratedKey::new(1, vec![0, 0, 0, 1]);

        // No clustering key: this mutation targets the static column.
        let static_row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![CellOperation::Write {
                column: String::from("static_val"),
                value: Value::Text(String::from("static")),
            }],
            1_001_000,
            None,
        );
        let clustered_row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            Some(ClusteringKey::single("ck", Value::Integer(1))),
            vec![CellOperation::Write {
                column: String::from("regular_val"),
                value: Value::Text(String::from("regular")),
            }],
            1_002_000,
            None,
        );

        writer
            .write_partition(&key, &[static_row, clustered_row], &schema, None, &[])
            .unwrap();
        let encoded = writer.finish().unwrap();

        // Skip the partition header: 2-byte key length, the key bytes, then
        // 4 + 8 further bytes (presumably the partition deletion time —
        // confirm against write_partition).
        let first_row_at = 2 + key.key.len() + 4 + 8;
        assert_eq!(
            encoded[first_row_at] & ROW_HAS_EXTENDED_FLAGS,
            ROW_HAS_EXTENDED_FLAGS
        );
        assert_eq!(encoded[first_row_at + 1], EXTENDED_IS_STATIC);
    }

    /// A superset of exactly 64 regular columns must already take the
    /// large-subset encoding rather than the missing-column bitmap.
    #[test]
    fn test_column_subset_exactly_64_regular_columns_uses_large_subset_encoding() {
        let writer = DataWriter::new(create_test_stats());

        // Regular text columns col_000 through col_063.
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: Vec::new(),
            columns: (0..64)
                .map(|idx| Column {
                    name: format!("col_{idx:03}"),
                    data_type: String::from("text"),
                    nullable: true,
                    default: None,
                    is_static: false,
                })
                .collect(),
            comments: HashMap::new(),
        };

        // Populate only the first and the last column.
        let cells: Vec<CellOperation> = [("col_000", "first"), ("col_063", "last")]
            .iter()
            .map(|&(column, text)| CellOperation::Write {
                column: column.to_string(),
                value: Value::Text(text.to_string()),
            })
            .collect();
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            cells,
            1_001_000,
            None,
        );

        let mut buf = Vec::new();
        writer.write_column_bitmap(&mut buf, &row, &schema).unwrap();

        // Expected layout: missing count (62), then the present indexes 0 and 63.
        assert_eq!(buf, vec![62, 0, 63]);
    }

    /// Large static-column supersets take the large-subset path too; when most
    /// columns are present the encoding lists the missing indexes.
    #[test]
    fn test_column_subset_65_static_columns_uses_missing_indexes_when_present_majority() {
        let writer = DataWriter::new(create_test_stats());

        // Static text columns scol_000 through scol_064.
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![ClusteringColumn {
                name: String::from("ck"),
                data_type: String::from("int"),
                position: 0,
                order: ClusteringOrder::Asc,
            }],
            columns: (0..65)
                .map(|idx| Column {
                    name: format!("scol_{idx:03}"),
                    data_type: String::from("text"),
                    nullable: true,
                    default: None,
                    is_static: true,
                })
                .collect(),
            comments: HashMap::new(),
        };

        // Every static column except index 17 receives a value.
        let operations: Vec<CellOperation> = (0..65)
            .filter(|&idx| idx != 17)
            .map(|idx| CellOperation::Write {
                column: format!("scol_{idx:03}"),
                value: Value::Text(format!("value-{idx}")),
            })
            .collect();

        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            operations,
            1_001_000,
            None,
        );

        let mut buf = Vec::new();
        writer
            .write_static_column_bitmap(&mut buf, &row, &schema)
            .unwrap();

        // Expected layout: missing count (1), then the single missing index.
        assert_eq!(buf, vec![1, 17]);
    }

    /// With fewer than 64 columns the writer emits the missing-column bitmap.
    #[test]
    fn test_column_subset_under_64_regular_columns_uses_bitmap() {
        let writer = DataWriter::new(create_test_stats());

        // Four regular text columns: col_0 through col_3.
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: Vec::new(),
            columns: (0..4)
                .map(|idx| Column {
                    name: format!("col_{idx}"),
                    data_type: String::from("text"),
                    nullable: true,
                    default: None,
                    is_static: false,
                })
                .collect(),
            comments: HashMap::new(),
        };

        let only_col_1 = CellOperation::Write {
            column: String::from("col_1"),
            value: Value::Text(String::from("present")),
        };
        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![only_col_1],
            1_001_000,
            None,
        );

        let mut buf = Vec::new();
        writer.write_column_bitmap(&mut buf, &row, &schema).unwrap();

        // Columns 0, 2 and 3 are missing, so those bits are set.
        assert_eq!(buf, vec![0b1101]);
    }

    /// `regular_columns` lists simple columns by name first, with the complex
    /// (multi-cell) column after them regardless of its name.
    #[test]
    fn test_regular_columns_sort_simple_before_complex() {
        let writer = DataWriter::new(create_test_stats());

        // Builds a nullable, non-static column with no default.
        let regular = |name: &str, data_type: &str| Column {
            name: name.to_string(),
            data_type: data_type.to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };

        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: Vec::new(),
            // Declared out of order on purpose.
            columns: vec![
                regular("z_simple", "text"),
                regular("a_complex", "set<text>"),
                regular("m_simple", "int"),
            ],
            comments: HashMap::new(),
        };

        let ordered = writer.regular_columns(&schema);
        let names: Vec<&str> = ordered.iter().map(|column| column.name.as_str()).collect();

        assert_eq!(names, vec!["m_simple", "z_simple", "a_complex"]);
    }

    /// `static_columns` lists simple static columns by name first, with the
    /// complex (multi-cell) static column after them.
    #[test]
    fn test_static_columns_sort_simple_before_complex() {
        let writer = DataWriter::new(create_test_stats());

        // Builds a nullable static column with no default.
        let static_column = |name: &str, data_type: &str| Column {
            name: name.to_string(),
            data_type: data_type.to_string(),
            nullable: true,
            default: None,
            is_static: true,
        };

        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![ClusteringColumn {
                name: String::from("ck"),
                data_type: String::from("int"),
                position: 0,
                order: ClusteringOrder::Asc,
            }],
            // Declared out of order on purpose.
            columns: vec![
                static_column("z_static_simple", "text"),
                static_column("a_static_complex", "set<text>"),
                static_column("m_static_simple", "int"),
            ],
            comments: HashMap::new(),
        };

        let ordered = writer.static_columns(&schema);
        let names: Vec<&str> = ordered.iter().map(|column| column.name.as_str()).collect();

        assert_eq!(
            names,
            vec!["m_static_simple", "z_static_simple", "a_static_complex"]
        );
    }

    /// When every one of 65 regular columns is written, `write_column_bitmap`
    /// emits a single zero byte.
    #[test]
    fn test_write_column_bitmap_zero_when_all_columns_present() {
        let writer = DataWriter::new(create_test_stats());

        // Regular text columns col_000 through col_064.
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: Vec::new(),
            columns: (0..65)
                .map(|idx| Column {
                    name: format!("col_{idx:03}"),
                    data_type: String::from("text"),
                    nullable: true,
                    default: None,
                    is_static: false,
                })
                .collect(),
            comments: HashMap::new(),
        };

        // One write per schema column, so nothing is missing.
        let every_column: Vec<_> = (0..65)
            .map(|idx| CellOperation::Write {
                column: format!("col_{idx:03}"),
                value: Value::Text(format!("value-{idx}")),
            })
            .collect();

        let row = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            every_column,
            1_001_000,
            None,
        );

        let mut buf = Vec::new();
        writer.write_column_bitmap(&mut buf, &row, &schema).unwrap();

        assert_eq!(buf, vec![0]);
    }

    /// A three-element int list is a 4-byte count followed by one
    /// length-prefixed entry per element.
    #[test]
    fn test_serialize_list() {
        let elements = (1..=3).map(Value::Integer).collect();
        let encoded = serialize_value(&Value::List(elements)).unwrap();

        // Count header plus three (4-byte length + 4-byte i32) entries.
        assert_eq!(encoded.len(), 4 + 3 * 8);

        let count_header = &encoded[..4];
        assert_eq!(count_header, &3i32.to_be_bytes());

        // First entry: its length, then its value.
        let first_len = &encoded[4..8];
        assert_eq!(first_len, &4i32.to_be_bytes());
        let first_value = &encoded[8..12];
        assert_eq!(first_value, &1i32.to_be_bytes());
    }

    /// An empty list serializes to just a zero element count.
    #[test]
    fn test_serialize_empty_list() {
        let encoded = serialize_value(&Value::List(Vec::new())).unwrap();
        assert_eq!(encoded.len(), 4);
        assert_eq!(&encoded[..4], &[0u8; 4]);
    }

    /// Pins the exact bytes of a one-element int list.
    #[test]
    fn test_serialize_single_element_list() {
        let encoded = serialize_value(&Value::List(vec![Value::Integer(42)])).unwrap();

        let expected = [
            1i32.to_be_bytes(),  // count = 1
            4i32.to_be_bytes(),  // len = 4
            42i32.to_be_bytes(), // value = 42
        ]
        .concat();

        assert_eq!(encoded, expected);
    }

    /// A two-element text set is a 4-byte count followed by one
    /// length-prefixed entry per element, in order.
    ///
    /// Both elements and the total length are checked so that a serializer
    /// which drops, truncates, or pads trailing elements cannot pass.
    #[test]
    fn test_serialize_set() {
        let set = Value::Set(vec![
            Value::Text("alpha".to_string()),
            Value::Text("beta".to_string()),
        ]);
        let bytes = serialize_value(&set).unwrap();

        // count(4) + [len(4) + "alpha"(5)] + [len(4) + "beta"(4)]
        assert_eq!(bytes.len(), 4 + (4 + 5) + (4 + 4));

        // Count = 2
        assert_eq!(&bytes[0..4], &2i32.to_be_bytes());
        // First element length = 5 ("alpha")
        assert_eq!(&bytes[4..8], &5i32.to_be_bytes());
        assert_eq!(&bytes[8..13], b"alpha");
        // Second element length = 4 ("beta")
        assert_eq!(&bytes[13..17], &4i32.to_be_bytes());
        assert_eq!(&bytes[17..21], b"beta");
    }

    /// Pins the exact bytes of a one-element text set.
    #[test]
    fn test_serialize_single_element_set() {
        let encoded = serialize_value(&Value::Set(vec![Value::Text("alpha".to_string())])).unwrap();

        let mut expected = Vec::new();
        expected.extend_from_slice(&1i32.to_be_bytes()); // count = 1
        expected.extend_from_slice(&5i32.to_be_bytes()); // len = 5
        expected.extend_from_slice(b"alpha"); // value = "alpha"

        assert_eq!(encoded, expected);
    }

    /// An empty set serializes to exactly a zero element count.
    #[test]
    fn test_serialize_empty_set() {
        let encoded = serialize_value(&Value::Set(Vec::new())).unwrap();
        assert_eq!(encoded, vec![0u8; 4]);
    }

    /// A single-entry map is a 4-byte count, then a length-prefixed key and a
    /// length-prefixed value.
    #[test]
    fn test_serialize_map() {
        let entry = (Value::Text("key1".to_string()), Value::Integer(100));
        let encoded = serialize_value(&Value::Map(vec![entry])).unwrap();

        // Entry count.
        assert_eq!(&encoded[..4], &1i32.to_be_bytes());

        // Key: 4-byte length, then the UTF-8 bytes of "key1".
        let (key_len, key) = (&encoded[4..8], &encoded[8..12]);
        assert_eq!(key_len, &4i32.to_be_bytes());
        assert_eq!(key, b"key1");

        // Value: 4-byte length, then the big-endian i32 100.
        let (value_len, value) = (&encoded[12..16], &encoded[16..20]);
        assert_eq!(value_len, &4i32.to_be_bytes());
        assert_eq!(value, &100i32.to_be_bytes());
    }

    /// An empty map serializes to just a zero entry count.
    #[test]
    fn test_serialize_empty_map() {
        let encoded = serialize_value(&Value::Map(Vec::new())).unwrap();
        assert_eq!(encoded.len(), 4);
        assert_eq!(&encoded[..4], &[0u8; 4]);
    }

    /// Tuple fields are written back to back as length-prefixed values, with
    /// a NULL field encoded as length -1.
    #[test]
    fn test_serialize_tuple() {
        let fields = vec![
            Value::Integer(42),
            Value::Text("hello".to_string()),
            Value::Null,
        ];
        let encoded = serialize_value(&Value::Tuple(fields)).unwrap();

        // Field 1 occupies bytes 0..8: length 4, then the i32 42.
        let (int_len, int_value) = (&encoded[..4], &encoded[4..8]);
        assert_eq!(int_len, &4i32.to_be_bytes());
        assert_eq!(int_value, &42i32.to_be_bytes());

        // Field 2 occupies bytes 8..17: length 5, then "hello".
        let (text_len, text_value) = (&encoded[8..12], &encoded[12..17]);
        assert_eq!(text_len, &5i32.to_be_bytes());
        assert_eq!(text_value, b"hello");

        // Field 3 is NULL: a bare length of -1.
        let null_marker = &encoded[17..21];
        assert_eq!(null_marker, &(-1i32).to_be_bytes());
    }

    /// Pins the exact bytes of a one-field text tuple: a length prefix and
    /// the value, with no element count in front.
    #[test]
    fn test_serialize_single_element_tuple() {
        let encoded = serialize_value(&Value::Tuple(vec![Value::Text("solo".to_string())])).unwrap();

        let mut expected = Vec::new();
        expected.extend_from_slice(&4i32.to_be_bytes()); // len = 4
        expected.extend_from_slice(b"solo"); // value = "solo"

        assert_eq!(encoded, expected);
    }

    /// Wrapping a value in `Frozen` must not change its serialized bytes.
    #[test]
    fn test_serialize_frozen() {
        let make_list = || Value::List(vec![Value::Integer(10), Value::Integer(20)]);

        let wrapped = serialize_value(&Value::Frozen(Box::new(make_list()))).unwrap();
        let bare = serialize_value(&make_list()).unwrap();

        assert_eq!(wrapped, bare);
    }

    /// The frozen wrapper is transparent for a one-element text list as well.
    #[test]
    fn test_serialize_single_element_frozen() {
        let make_list = || Value::List(vec![Value::Text("solo".to_string())]);

        let wrapped = serialize_value(&Value::Frozen(Box::new(make_list()))).unwrap();
        let bare = serialize_value(&make_list()).unwrap();

        assert_eq!(wrapped, bare);
    }

    /// A MAP<TEXT, FROZEN<LIST<INT>>> value serializes without error and
    /// starts with an entry count of one.
    #[test]
    fn test_serialize_nested_collection() {
        let frozen_list = Value::Frozen(Box::new(Value::List(vec![
            Value::Integer(1),
            Value::Integer(2),
        ])));
        let nested = Value::Map(vec![(Value::Text("nums".to_string()), frozen_list)]);

        // Unwrapping proves that nested serialization is supported at all.
        let encoded = serialize_value(&nested).unwrap();

        assert!(!encoded.is_empty());
        assert_eq!(&encoded[..4], &1i32.to_be_bytes());
    }

    /// The schema-free `serialize_value` path must produce the same bytes for
    /// a UDT as the schema-aware `TypeSerializer::serialize_udt`.
    #[test]
    fn test_serialize_udt_with_nested_collections_matches_schema_aware_bytes() {
        let serializer = TypeSerializer::new();
        let company = Value::Udt(phase3_company_value());

        let schema_free = serialize_value(&company).unwrap();
        let schema_aware = serializer
            .serialize_udt(&company, &phase3_company_schema())
            .unwrap();

        assert_eq!(schema_free, schema_aware);
    }

    /// A map whose value is a frozen UDT embeds the schema-aware UDT bytes
    /// behind a length prefix, after a length-prefixed (non-ASCII) text key.
    #[test]
    fn test_serialize_collection_containing_nested_udts() {
        /// Appends `chunk` to `out` behind its big-endian i32 length.
        fn push_prefixed(out: &mut Vec<u8>, chunk: &[u8]) {
            out.extend_from_slice(&(chunk.len() as i32).to_be_bytes());
            out.extend_from_slice(chunk);
        }

        let serializer = TypeSerializer::new();
        let company = phase3_company_value();
        let company_bytes = serializer
            .serialize_udt(&Value::Udt(company.clone()), &phase3_company_schema())
            .unwrap();

        let map = Value::Map(vec![(
            Value::Text("empresa_日本".to_string()),
            Value::Frozen(Box::new(Value::Udt(company))),
        )]);
        let encoded = serialize_value(&map).unwrap();

        // One entry: prefixed key bytes, then prefixed UDT bytes.
        let mut expected = Vec::new();
        expected.extend_from_slice(&1i32.to_be_bytes());
        push_prefixed(&mut expected, "empresa_日本".as_bytes());
        push_prefixed(&mut expected, &company_bytes);

        assert_eq!(encoded, expected);
    }

    /// A tuple of (text, frozen list, frozen map of UDT, frozen UDT) is the
    /// concatenation of each field's bytes behind a length prefix.
    #[test]
    fn test_serialize_tuple_with_collection_fields_and_udt() {
        /// Appends `chunk` to `out` behind its big-endian i32 length.
        fn push_prefixed(out: &mut Vec<u8>, chunk: &[u8]) {
            out.extend_from_slice(&(chunk.len() as i32).to_be_bytes());
            out.extend_from_slice(chunk);
        }

        let serializer = TypeSerializer::new();
        let address = phase3_address_value();
        let person = phase3_person_value("Tuple User");

        // Reference encodings of the two UDTs via the schema-aware path.
        let address_bytes = serializer
            .serialize_udt(&Value::Udt(address.clone()), &phase3_address_schema())
            .unwrap();
        let person_bytes = serializer
            .serialize_udt(&Value::Udt(person.clone()), &phase3_person_schema())
            .unwrap();

        let make_list = || {
            Value::List(vec![
                Value::Integer(3),
                Value::Integer(5),
                Value::Integer(8),
            ])
        };

        let tuple = Value::Tuple(vec![
            Value::Text("phase3".to_string()),
            Value::Frozen(Box::new(make_list())),
            Value::Frozen(Box::new(Value::Map(vec![(
                Value::Text("home".to_string()),
                Value::Frozen(Box::new(Value::Udt(address))),
            )]))),
            Value::Frozen(Box::new(Value::Udt(person))),
        ]);
        let encoded = serialize_value(&tuple).unwrap();

        // Expected bytes of the list field, taken from the serializer itself.
        let list_bytes = serialize_value(&make_list()).unwrap();

        // Expected bytes of the map field: one entry, "home" -> address UDT.
        let mut map_bytes = Vec::new();
        map_bytes.extend_from_slice(&1i32.to_be_bytes());
        push_prefixed(&mut map_bytes, b"home");
        push_prefixed(&mut map_bytes, &address_bytes);

        let mut expected = Vec::new();
        push_prefixed(&mut expected, b"phase3");
        push_prefixed(&mut expected, &list_bytes);
        push_prefixed(&mut expected, &map_bytes);
        push_prefixed(&mut expected, &person_bytes);

        assert_eq!(encoded, expected);
    }

    /// Four levels of nesting (map -> list -> map -> list) serialize without
    /// error and start with the outer map's entry count of one.
    #[test]
    fn test_serialize_high_complexity_nested_collection() {
        // Built inside-out; every inner level is frozen.
        let innermost_list = Value::List(vec![Value::Integer(1), Value::Integer(2)]);
        let inner_map = Value::Map(vec![(
            Value::Text("inner".to_string()),
            Value::Frozen(Box::new(innermost_list)),
        )]);
        let middle_list = Value::List(vec![Value::Frozen(Box::new(inner_map))]);
        let nested = Value::Map(vec![(
            Value::Text("outer".to_string()),
            Value::Frozen(Box::new(middle_list)),
        )]);

        let encoded = serialize_value(&nested).unwrap();

        assert!(!encoded.is_empty());
        assert_eq!(&encoded[..4], &1i32.to_be_bytes());
    }

    // ========== Complex Column (Multi-Cell) Tests ==========

    /// `is_complex_column` is true only for non-frozen collections, in both
    /// CQL and Cassandra-internal type syntax.
    #[test]
    fn test_is_complex_column() {
        // Non-frozen collections in CQL syntax, including mixed casing.
        let complex_cql = [
            "set<int>",
            "list<text>",
            "map<text, int>",
            "SET<INT>",
            "List<Text>",
            "Map<Text, Int>",
        ];
        for data_type in complex_cql {
            assert!(is_complex_column(data_type));
        }

        // Non-frozen collections in Cassandra's internal marshal syntax.
        let complex_internal = [
            "org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.Int32Type)",
            "org.apache.cassandra.db.marshal.ListType(org.apache.cassandra.db.marshal.UTF8Type)",
            "org.apache.cassandra.db.marshal.MapType(org.apache.cassandra.db.marshal.UTF8Type,org.apache.cassandra.db.marshal.Int32Type)",
        ];
        for data_type in complex_internal {
            assert!(is_complex_column(data_type));
        }

        // Frozen collections in CQL syntax are single-cell.
        let frozen_cql = [
            "frozen<set<int>>",
            "frozen<list<text>>",
            "frozen<map<text, int>>",
            "FROZEN<SET<INT>>",
        ];
        for data_type in frozen_cql {
            assert!(!is_complex_column(data_type));
        }

        // Frozen collection in Cassandra's internal syntax.
        let frozen_internal = [
            "org.apache.cassandra.db.marshal.FrozenType(org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.Int32Type))",
        ];
        for data_type in frozen_internal {
            assert!(!is_complex_column(data_type));
        }

        // Primitive types are never complex.
        let primitives = ["int", "text", "uuid", "timestamp"];
        for data_type in primitives {
            assert!(!is_complex_column(data_type));
        }
    }

    #[test]
    fn test_generate_list_cell_path_timeuuid() {
        // Generates list cell paths for three consecutive indices at one fixed
        // timestamp and checks size, UUID version, and ordering.
        let ts = 1_704_067_200_000_000i64; // 2024-01-01 00:00:00 UTC

        let uuid0 = generate_list_cell_path_timeuuid(ts, 0);
        let uuid1 = generate_list_cell_path_timeuuid(ts, 1);
        let uuid2 = generate_list_cell_path_timeuuid(ts, 2);

        // Every generated path is checked, including the last one, which
        // previously only took part in the ordering assertion.
        for (idx, uuid) in [&uuid0, &uuid1, &uuid2].iter().enumerate() {
            // All should be 16 bytes
            assert_eq!(uuid.len(), 16, "UUID{} should be 16 bytes", idx);

            // Version bits should be 1 (0x1X in byte 6)
            assert_eq!(uuid[6] & 0xF0, 0x10, "UUID{} should be UUID version 1", idx);
        }

        // UUIDs should be monotonically increasing (as byte arrays)
        assert!(uuid0 < uuid1, "UUID0 should be less than UUID1");
        assert!(uuid1 < uuid2, "UUID1 should be less than UUID2");
    }

    #[test]
    fn test_write_set_complex_column() {
        // A non-frozen SET written without TTL must produce one cell per
        // element, each flagged USE_ROW_TIMESTAMP | HAS_EMPTY_VALUE.
        let stats = create_test_stats();
        let writer = DataWriter::new(stats);

        let column = Column {
            name: "tags".to_string(),
            data_type: "set<text>".to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };

        let value = Value::Set(vec![
            Value::Text("alpha".to_string()),
            Value::Text("beta".to_string()),
        ]);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, None)
            .unwrap();

        assert!(!buf.is_empty());

        // Layout: complex_deletion (2 VInts) + cell_count + cells, where each
        // SET cell is flags(1) + path_len(VInt) + path_bytes (no value).
        //
        // Read the flag bytes at their real offsets rather than counting
        // matching bytes anywhere in the buffer: a header, length, or path byte
        // equal to the flag value would otherwise skew the count.
        let cell_flags = parse_complex_cell_flags(&buf);
        let expected_cell_flags = CELL_USE_ROW_TIMESTAMP | CELL_HAS_EMPTY_VALUE; // 0x0C

        assert_eq!(
            cell_flags.len(),
            2,
            "SET with 2 elements should produce 2 cells"
        );
        assert!(
            cell_flags.iter().all(|&f| f == expected_cell_flags),
            "Should have 2 SET cells with USE_ROW_TIMESTAMP | HAS_EMPTY_VALUE flags, got: {:?}",
            cell_flags
        );
    }

    #[test]
    fn test_write_map_complex_column() {
        // A non-frozen MAP written without TTL must produce one cell per entry,
        // each flagged USE_ROW_TIMESTAMP and nothing else.
        let stats = create_test_stats();
        let writer = DataWriter::new(stats);

        let column = Column {
            name: "props".to_string(),
            data_type: "map<text, int>".to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };

        let value = Value::Map(vec![
            (Value::Text("key1".to_string()), Value::Integer(100)),
            (Value::Text("key2".to_string()), Value::Integer(200)),
        ]);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, None)
            .unwrap();

        assert!(!buf.is_empty());

        // MAP cells have USE_ROW_TIMESTAMP (0x08) but NOT HAS_EMPTY_VALUE.
        // The flags are read structurally so that key, value, or length bytes
        // that happen to equal 0x08 cannot be miscounted as cell flags.
        let cell_flags = parse_complex_cell_flags(&buf);

        assert_eq!(
            cell_flags.len(),
            2,
            "MAP with 2 entries should produce 2 cells"
        );
        assert!(
            cell_flags.iter().all(|&f| f == CELL_USE_ROW_TIMESTAMP),
            "Should have 2 MAP cells with USE_ROW_TIMESTAMP flags, got: {:?}",
            cell_flags
        );
    }

    #[test]
    fn test_write_list_complex_column() {
        // A non-frozen LIST written without TTL must produce one cell per
        // element, each flagged USE_ROW_TIMESTAMP, with a 16-byte cell path.
        let stats = create_test_stats();
        let writer = DataWriter::new(stats);

        let column = Column {
            name: "items".to_string(),
            data_type: "list<int>".to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };

        let value = Value::List(vec![Value::Integer(10), Value::Integer(20)]);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, None)
            .unwrap();

        assert!(!buf.is_empty());

        // LIST cells have USE_ROW_TIMESTAMP (0x08) and 16-byte TimeUUID paths.
        // The flags are read structurally: the 16 path bytes of each cell are
        // arbitrary UUID data and may themselves contain 0x08, which would
        // distort a raw byte count.
        let cell_flags = parse_complex_cell_flags(&buf);
        assert_eq!(
            cell_flags.len(),
            2,
            "LIST with 2 elements should produce 2 cells"
        );
        assert!(
            cell_flags.iter().all(|&f| f == CELL_USE_ROW_TIMESTAMP),
            "Should have 2 LIST cells with USE_ROW_TIMESTAMP flags, got: {:?}",
            cell_flags
        );

        // Verify TimeUUID path length (16) appears in the output
        // Each cell has: flags(1) + path_len_vint(1, value=16=0x10) + path(16) + val_len + val
        // The VInt encoding of 16 is 0x10
        let timeuuid_len_count = buf.iter().filter(|&&b| b == 0x10).count();
        assert!(
            timeuuid_len_count >= 2,
            "Should have TimeUUID path length (16) for each list cell"
        );
    }

    #[test]
    fn test_frozen_collection_not_complex() {
        // A frozen collection is written as a simple cell (serialize_value),
        // so the row must not carry the complex-deletion flag.
        let id_key = KeyColumn {
            name: String::from("id"),
            data_type: String::from("int"),
            position: 0,
        };
        let frozen_tags_column = Column {
            name: String::from("frozen_tags"),
            data_type: String::from("frozen<set<text>>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![id_key],
            clustering_keys: vec![],
            columns: vec![frozen_tags_column],
            comments: HashMap::new(),
        };

        let frozen_set = Value::Frozen(Box::new(Value::Set(vec![
            Value::Text(String::from("a")),
            Value::Text(String::from("b")),
        ])));
        let write_op = CellOperation::Write {
            column: String::from("frozen_tags"),
            value: frozen_set,
        };

        let mut writer = DataWriter::new(create_test_stats());
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![write_op],
            1001000,
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        let output = writer.finish().unwrap();
        assert!(!output.is_empty());

        // The test treats the first output byte as the row flags byte.
        let row_flags = output[0];
        assert_eq!(
            row_flags & ROW_HAS_COMPLEX_DELETION,
            0,
            "Frozen collection should NOT have HAS_COMPLEX_DELETION flag"
        );
    }

    #[test]
    fn test_mixed_simple_and_complex_columns() {
        // One simple text column plus one non-frozen SET column, both written
        // in the same row.
        let name_column = Column {
            name: String::from("name"),
            data_type: String::from("text"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let tags_column = Column {
            name: String::from("tags"),
            data_type: String::from("set<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![],
            columns: vec![name_column, tags_column],
            comments: HashMap::new(),
        };

        let write_name = CellOperation::Write {
            column: String::from("name"),
            value: Value::Text(String::from("Alice")),
        };
        let write_tags = CellOperation::Write {
            column: String::from("tags"),
            value: Value::Set(vec![
                Value::Text(String::from("admin")),
                Value::Text(String::from("user")),
            ]),
        };

        let mut writer = DataWriter::new(create_test_stats());
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![write_name, write_tags],
            1001000,
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        let output = writer.finish().unwrap();
        assert!(!output.is_empty());

        // The test treats the first output byte as the row flags byte.
        let row_flags = output[0];

        // The SET column makes the row carry HAS_COMPLEX_DELETION.
        assert_eq!(
            row_flags & ROW_HAS_COMPLEX_DELETION,
            ROW_HAS_COMPLEX_DELETION,
            "Row with non-frozen SET should have HAS_COMPLEX_DELETION flag"
        );
        assert_eq!(
            row_flags & ROW_HAS_TIMESTAMP,
            ROW_HAS_TIMESTAMP,
            "Should have timestamp"
        );
        // Both regular columns were written, so no missing-column bitmap.
        assert_eq!(
            row_flags & ROW_HAS_ALL_COLUMNS,
            ROW_HAS_ALL_COLUMNS,
            "Should have all columns"
        );
    }

    #[test]
    fn test_set_canonical_ordering() {
        // SET elements supplied out of order must come out in sorted order.
        let writer = DataWriter::new(create_test_stats());

        let column = Column {
            name: String::from("tags"),
            data_type: String::from("set<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };

        // Deliberately unsorted input: zebra, alpha, mango.
        let unsorted: Vec<Value> = ["zebra", "alpha", "mango"]
            .iter()
            .map(|s| Value::Text(s.to_string()))
            .collect();
        let value = Value::Set(unsorted);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, None)
            .unwrap();

        // Each element is stored as the cell path, so its ASCII text appears
        // verbatim in the output. Locate each one in a lossy string view and
        // compare offsets.
        let rendered = String::from_utf8_lossy(&buf);
        let offset_of = |needle: &str, missing: &str| rendered.find(needle).expect(missing);
        let alpha_pos = offset_of("alpha", "alpha should be in output");
        let mango_pos = offset_of("mango", "mango should be in output");
        let zebra_pos = offset_of("zebra", "zebra should be in output");

        assert!(
            alpha_pos < mango_pos && mango_pos < zebra_pos,
            "SET elements should be in sorted order: alpha({}) < mango({}) < zebra({})",
            alpha_pos,
            mango_pos,
            zebra_pos
        );
    }

    #[test]
    fn test_map_canonical_ordering() {
        // MAP entries supplied out of key order must come out sorted by key.
        let writer = DataWriter::new(create_test_stats());

        let column = Column {
            name: String::from("props"),
            data_type: String::from("map<text, int>"),
            nullable: true,
            default: None,
            is_static: false,
        };

        // z_key is given before a_key on purpose.
        let z_entry = (Value::Text(String::from("z_key")), Value::Integer(1));
        let a_entry = (Value::Text(String::from("a_key")), Value::Integer(2));
        let value = Value::Map(vec![z_entry, a_entry]);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, None)
            .unwrap();

        // Keys are ASCII, so they can be located in a lossy string view.
        let rendered = String::from_utf8_lossy(&buf);
        let a_pos = rendered.find("a_key").expect("a_key should be in output");
        let z_pos = rendered.find("z_key").expect("z_key should be in output");

        assert!(
            a_pos < z_pos,
            "MAP entries should be sorted by key: a_key({}) < z_key({})",
            a_pos,
            z_pos
        );
    }

    #[test]
    fn test_set_rejects_list_value() {
        // Writing a Value::List into a column declared as set<text> must fail.
        let writer = DataWriter::new(create_test_stats());

        let set_column = Column {
            name: String::from("tags"),
            data_type: String::from("set<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let mismatched = Value::List(vec![Value::Text(String::from("x"))]);

        let mut buf = Vec::new();
        let outcome = writer.write_complex_column(&mut buf, &set_column, &mismatched, 1001000, None);

        assert!(outcome.is_err(), "SET column should reject Value::List");
    }

    #[test]
    fn test_list_rejects_set_value() {
        // Writing a Value::Set into a column declared as list<text> must fail.
        let writer = DataWriter::new(create_test_stats());

        let list_column = Column {
            name: String::from("items"),
            data_type: String::from("list<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let mismatched = Value::Set(vec![Value::Text(String::from("x"))]);

        let mut buf = Vec::new();
        let outcome =
            writer.write_complex_column(&mut buf, &list_column, &mismatched, 1001000, None);

        assert!(outcome.is_err(), "LIST column should reject Value::Set");
    }

    #[test]
    fn test_complex_column_deletion() {
        // A whole-column deletion writes the deletion header followed by an
        // empty cell list.
        let writer = DataWriter::new(create_test_stats());

        let mut buf: Vec<u8> = Vec::new();
        writer
            .write_complex_column_deletion(&mut buf, 1001000)
            .unwrap();

        assert!(!buf.is_empty());

        // Expected content: marked_for_delete_at delta + local_deletion_time
        // delta + cell_count(0). A zero cell count encodes as a single 0x00
        // unsigned VInt, so that is the final byte.
        assert_eq!(
            buf.last().copied(),
            Some(0x00),
            "Last byte should be cell_count = 0"
        );
    }

    #[test]
    fn test_write_with_ttl_complex_column() {
        // WriteWithTtl aimed at a non-frozen SET must take the complex-column
        // path rather than the simple-cell path.
        let tags_column = Column {
            name: String::from("tags"),
            data_type: String::from("set<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![],
            columns: vec![tags_column],
            comments: HashMap::new(),
        };

        let ttl_write = CellOperation::WriteWithTtl {
            column: String::from("tags"),
            value: Value::Set(vec![
                Value::Text(String::from("a")),
                Value::Text(String::from("b")),
            ]),
            ttl_seconds: 3600,
        };

        let mut writer = DataWriter::new(create_test_stats());
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![ttl_write],
            1001000,
            None,
        );

        // Must not error: the complex format is expected to be used.
        writer.write_row(&mutation, &schema).unwrap();

        let output = writer.finish().unwrap();
        assert!(!output.is_empty());

        // The test treats the first output byte as the row flags byte.
        let row_flags = output[0];
        assert_eq!(
            row_flags & ROW_HAS_COMPLEX_DELETION,
            ROW_HAS_COMPLEX_DELETION,
            "WriteWithTtl on SET should set HAS_COMPLEX_DELETION"
        );
    }

    #[test]
    fn test_delete_complex_column() {
        // Deleting a non-frozen SET column must emit a complex deletion, not a
        // simple tombstone cell.
        let tags_column = Column {
            name: String::from("tags"),
            data_type: String::from("set<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![],
            columns: vec![tags_column],
            comments: HashMap::new(),
        };

        let delete_tags = CellOperation::Delete {
            column: String::from("tags"),
        };

        let mut writer = DataWriter::new(create_test_stats());
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![delete_tags],
            1001000,
            None,
        );

        // Must not error: the complex deletion format is expected to be used.
        writer.write_row(&mutation, &schema).unwrap();

        let output = writer.finish().unwrap();
        assert!(!output.is_empty());

        // The test treats the first output byte as the row flags byte.
        let row_flags = output[0];
        assert_eq!(
            row_flags & ROW_HAS_COMPLEX_DELETION,
            ROW_HAS_COMPLEX_DELETION,
            "Delete on SET should set HAS_COMPLEX_DELETION"
        );
    }

    #[test]
    fn test_internal_type_string_complex_column() {
        // A column whose type is spelled in Cassandra's internal marshal-class
        // syntax must be handled as a complex column by write_row.
        let internal_set_type =
            "org.apache.cassandra.db.marshal.SetType(org.apache.cassandra.db.marshal.UTF8Type)";
        let tags_column = Column {
            name: String::from("tags"),
            data_type: internal_set_type.to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![],
            columns: vec![tags_column],
            comments: HashMap::new(),
        };

        let write_tags = CellOperation::Write {
            column: String::from("tags"),
            value: Value::Set(vec![Value::Text(String::from("test"))]),
        };

        let mut writer = DataWriter::new(create_test_stats());
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![write_tags],
            1001000,
            None,
        );

        writer.write_row(&mutation, &schema).unwrap();

        // The test treats the first output byte as the row flags byte.
        let output = writer.finish().unwrap();
        let row_flags = output[0];
        assert_eq!(
            row_flags & ROW_HAS_COMPLEX_DELETION,
            ROW_HAS_COMPLEX_DELETION,
            "Internal type string should be recognized as complex column"
        );
    }

    /// Parse a `write_complex_column` output buffer and return the flag byte for every cell.
    ///
    /// The buffer has this deterministic structure:
    /// ```text
    /// [complex_deletion_ts_delta:  unsigned VInt]  ← 2 VInts, time-derived but fixed per stats
    /// [complex_deletion_ldt_delta: unsigned VInt]
    /// [cell_count: unsigned VInt]
    /// per cell:
    ///   [flags: u8]
    ///   if IS_EXPIRING (0x02 set):
    ///     [ts_delta:  unsigned VInt]
    ///     [ldt_delta: unsigned VInt]   ← wall-clock-derived
    ///     [ttl_delta: unsigned VInt]
    ///   [path_len:  unsigned VInt]
    ///   [path_bytes: path_len]
    ///   if !HAS_EMPTY_VALUE (0x04 NOT set):
    ///     [value_len: unsigned VInt]
    ///     [value_bytes: value_len]
    /// ```
    ///
    /// Scanning the raw buffer for a flag byte value is fragile because
    /// wall-clock-derived LDT bytes can coincidentally equal the flag byte (~1-2% of CI runs).
    /// This helper walks the structure deterministically so each flag byte is read at
    /// its exact position.
    ///
    /// Only the two cell shapes above are understood: a non-expiring cell is
    /// assumed to have no per-cell timestamp before its path.
    ///
    /// # Panics
    ///
    /// Panics (slice index out of bounds) if `buf` is truncated or does not
    /// follow the layout above. This is acceptable for a test helper.
    fn parse_complex_cell_flags(buf: &[u8]) -> Vec<u8> {
        /// Read one unsigned VInt from `buf` starting at `*pos`; advance `*pos`.
        ///
        /// The number of leading 1-bits in the first byte is the number of
        /// continuation bytes (0..=8). The remaining low bits of the first
        /// byte are the most significant data bits.
        fn read_uvint(buf: &[u8], pos: &mut usize) -> u64 {
            let first = buf[*pos];
            *pos += 1;

            let extra = first.leading_ones() as usize;

            // Mask off the leading 1s and the 0 separator. For `extra >= 7`
            // (first byte 0xFE or 0xFF) the shift is >= 8, so the first byte
            // carries no data bits at all. `checked_shr` returns `None` there,
            // whereas `wrapping_shr` would wrap the shift to 0 and leak the
            // prefix byte into the value.
            let mask = 0xFF_u8.checked_shr(extra as u32 + 1).unwrap_or(0);

            let mut v = u64::from(first & mask);
            for _ in 0..extra {
                v = (v << 8) | u64::from(buf[*pos]);
                *pos += 1;
            }
            v
        }

        let mut pos = 0usize;

        // Skip complex deletion header: 2 VInts whose values are irrelevant here.
        read_uvint(buf, &mut pos);
        read_uvint(buf, &mut pos);

        // Cell count
        let cell_count = read_uvint(buf, &mut pos) as usize;

        let mut flags_out = Vec::with_capacity(cell_count);
        for _ in 0..cell_count {
            let flags = buf[pos];
            pos += 1;
            flags_out.push(flags);

            if (flags & CELL_IS_EXPIRING) != 0 {
                // IS_EXPIRING: ts_delta + ldt_delta + ttl_delta (3 unsigned VInts)
                for _ in 0..3 {
                    read_uvint(buf, &mut pos);
                }
            }
            // USE_ROW_TIMESTAMP / non-expiring cells: no extra fields before path

            // Cell path: path_len VInt + path_len bytes
            let path_len = read_uvint(buf, &mut pos) as usize;
            pos += path_len;

            // Cell value: only present when HAS_EMPTY_VALUE is NOT set
            if (flags & CELL_HAS_EMPTY_VALUE) == 0 {
                let value_len = read_uvint(buf, &mut pos) as usize;
                pos += value_len;
            }
        }

        flags_out
    }

    #[test]
    fn test_set_complex_column_with_ttl() {
        // With a TTL, every SET cell carries IS_EXPIRING instead of
        // USE_ROW_TIMESTAMP. Flags are read via structural parsing so that
        // time-derived LDT bytes equal to 0x02 cannot be mistaken for flags.
        let writer = DataWriter::new(create_test_stats());

        let column = Column {
            name: String::from("tags"),
            data_type: String::from("set<text>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let elements = vec![
            Value::Text(String::from("alpha")),
            Value::Text(String::from("beta")),
        ];
        let value = Value::Set(elements);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, Some(3600))
            .unwrap();

        // One flag byte per cell, taken from its exact position.
        let flags_per_cell = parse_complex_cell_flags(&buf);
        let wanted = CELL_IS_EXPIRING | CELL_HAS_EMPTY_VALUE; // 0x06

        assert_eq!(
            flags_per_cell.len(),
            2,
            "SET with 2 elements should produce 2 cells"
        );

        let every_cell_matches = !flags_per_cell.iter().any(|&f| f != wanted);
        assert!(
            every_cell_matches,
            "SET with TTL: all cells should have IS_EXPIRING | HAS_EMPTY_VALUE (0x06), got: {:?}",
            flags_per_cell
        );

        // USE_ROW_TIMESTAMP must be absent from every cell.
        let any_row_timestamp = flags_per_cell
            .iter()
            .any(|&f| (f & CELL_USE_ROW_TIMESTAMP) != 0);
        assert!(
            !any_row_timestamp,
            "SET with TTL should NOT have USE_ROW_TIMESTAMP on any cell, got: {:?}",
            flags_per_cell
        );
    }

    #[test]
    fn test_map_complex_column_with_ttl() {
        // With a TTL, a MAP cell carries IS_EXPIRING and still has a value.
        // Flags are read via structural parsing so that time-derived LDT bytes
        // equal to 0x02 cannot be mistaken for flags.
        let writer = DataWriter::new(create_test_stats());

        let column = Column {
            name: String::from("props"),
            data_type: String::from("map<text, int>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let single_entry = (Value::Text(String::from("key1")), Value::Integer(100));
        let value = Value::Map(vec![single_entry]);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, Some(7200))
            .unwrap();

        let flags_per_cell = parse_complex_cell_flags(&buf);
        assert_eq!(
            flags_per_cell.len(),
            1,
            "MAP with 1 entry should produce 1 cell"
        );

        let only_cell = flags_per_cell[0];
        assert_eq!(
            only_cell & CELL_IS_EXPIRING,
            CELL_IS_EXPIRING,
            "MAP with TTL: cell should have IS_EXPIRING flag set, got flags byte: 0x{:02X}",
            only_cell
        );
        assert_eq!(
            only_cell & CELL_HAS_EMPTY_VALUE,
            0,
            "MAP with TTL: cell should NOT have HAS_EMPTY_VALUE, got flags byte: 0x{:02X}",
            only_cell
        );
    }

    #[test]
    fn test_list_complex_column_with_ttl() {
        // Writes the same LIST twice, once with a TTL and once without, and
        // compares both the encoded size and the per-cell flags. Flags are read
        // via structural parsing to avoid false positives from LDT bytes.
        let stats = create_test_stats();
        let plain_writer = DataWriter::new(stats.clone());
        let expiring_writer = DataWriter::new(stats);

        let column = Column {
            name: String::from("items"),
            data_type: String::from("list<int>"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let value = Value::List((1..=3).map(Value::Integer).collect());

        let mut with_ttl = Vec::new();
        expiring_writer
            .write_complex_column(&mut with_ttl, &column, &value, 1001000, Some(1800))
            .unwrap();

        let mut without_ttl = Vec::new();
        plain_writer
            .write_complex_column(&mut without_ttl, &column, &value, 1001000, None)
            .unwrap();

        // Expiring cells carry timestamp + LDT + TTL deltas, so the TTL
        // encoding has to be strictly longer.
        assert!(
            with_ttl.len() > without_ttl.len(),
            "LIST with TTL ({} bytes) should be larger than without TTL ({} bytes)",
            with_ttl.len(),
            without_ttl.len()
        );

        // TTL version: three cells, IS_EXPIRING on each.
        let expiring_flags = parse_complex_cell_flags(&with_ttl);
        assert_eq!(
            expiring_flags.len(),
            3,
            "LIST with 3 elements should produce 3 cells"
        );
        let all_expiring = !expiring_flags.iter().any(|&f| (f & CELL_IS_EXPIRING) == 0);
        assert!(
            all_expiring,
            "LIST with TTL: all cells should have IS_EXPIRING flag set, got: {:?}",
            expiring_flags
        );

        // No-TTL version: three cells, IS_EXPIRING on none.
        let plain_flags = parse_complex_cell_flags(&without_ttl);
        assert_eq!(plain_flags.len(), 3);
        let none_expiring = !plain_flags.iter().any(|&f| (f & CELL_IS_EXPIRING) != 0);
        assert!(
            none_expiring,
            "LIST without TTL: no cells should have IS_EXPIRING flag, got: {:?}",
            plain_flags
        );
    }

    #[test]
    fn test_complex_column_no_ttl_uses_row_timestamp() {
        // Regression: without TTL, cells should still use USE_ROW_TIMESTAMP
        let stats = create_test_stats();
        let writer = DataWriter::new(stats);

        let column = Column {
            name: "tags".to_string(),
            data_type: "set<text>".to_string(),
            nullable: true,
            default: None,
            is_static: false,
        };

        let value = Value::Set(vec![Value::Text("x".to_string())]);

        let mut buf = Vec::new();
        writer
            .write_complex_column(&mut buf, &column, &value, 1001000, None)
            .unwrap();

        // Without TTL: USE_ROW_TIMESTAMP | HAS_EMPTY_VALUE = 0x0C.
        // The flag is read at its real offset rather than counting 0x0C bytes
        // anywhere in the buffer, so header or path bytes cannot satisfy or
        // break the assertion by coincidence.
        let cell_flags = parse_complex_cell_flags(&buf);
        let expected_flags = CELL_USE_ROW_TIMESTAMP | CELL_HAS_EMPTY_VALUE;

        assert_eq!(
            cell_flags.len(),
            1,
            "SET with 1 element should produce 1 cell"
        );
        assert_eq!(
            cell_flags[0], expected_flags,
            "Without TTL, SET cells should use USE_ROW_TIMESTAMP"
        );
    }

    #[test]
    fn test_bitmap_includes_deleted_columns() {
        // A Delete operation counts as "column present" for the missing-column
        // bitmap, exactly like a Write.
        let int_column = |column_name: &str| Column {
            name: column_name.to_string(),
            data_type: String::from("int"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let age_column = int_column("age");
        let name_column = Column {
            name: String::from("name"),
            data_type: String::from("text"),
            nullable: true,
            default: None,
            is_static: false,
        };
        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![],
            columns: vec![age_column, name_column],
            comments: HashMap::new(),
        };

        let writer = DataWriter::new(create_test_stats());

        // "age" is deleted and "name" is written: together they cover every
        // regular column in the schema.
        let delete_age = CellOperation::Delete {
            column: String::from("age"),
        };
        let write_name = CellOperation::Write {
            column: String::from("name"),
            value: Value::Text(String::from("Alice")),
        };
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![delete_age, write_name],
            1001000,
            None,
        );

        let mut bitmap = Vec::new();
        writer
            .write_column_bitmap(&mut bitmap, &mutation, &schema)
            .unwrap();

        // A bitmap of 0 has no MISSING bits set, i.e. all columns are present.
        assert_eq!(bitmap.len(), 1, "Bitmap should be a single byte");
        assert_eq!(
            bitmap[0], 0,
            "Bitmap should be 0 (all columns present) when both write and delete cover all columns"
        );
    }

    #[test]
    fn test_bitmap_delete_only_column_is_present() {
        // A column touched only by a Delete is still "present" in the bitmap;
        // a column the mutation never mentions is "missing".

        // (name, CQL type) for each regular column of the test table.
        let column_specs = [("age", "int"), ("name", "text")];
        let columns: Vec<Column> = column_specs
            .iter()
            .map(|&(name, data_type)| Column {
                name: String::from(name),
                data_type: String::from(data_type),
                nullable: true,
                default: None,
                is_static: false,
            })
            .collect();

        let schema = TableSchema {
            keyspace: String::from("test_ks"),
            table: String::from("test_table"),
            partition_keys: vec![KeyColumn {
                name: String::from("id"),
                data_type: String::from("int"),
                position: 0,
            }],
            clustering_keys: vec![],
            columns,
            comments: HashMap::new(),
        };

        // The mutation deletes "age" and does not mention "name" at all.
        let delete_age = CellOperation::Delete {
            column: String::from("age"),
        };
        let mutation = Mutation::new(
            TableId::new("test_ks", "test_table"),
            PartitionKey::single("id", Value::Integer(1)),
            None,
            vec![delete_age],
            1_001_000,
            None,
        );

        let writer = DataWriter::new(create_test_stats());
        let mut encoded = Vec::new();
        writer
            .write_column_bitmap(&mut encoded, &mutation, &schema)
            .unwrap();

        // Regular columns in alphabetical order: [age, name].
        //   age  -> index 0, present (Delete) -> bit 0 clear
        //   name -> index 1, missing          -> bit 1 set
        // Expected bitmap: 0b10 = 2.
        assert_eq!(encoded.len(), 1);
        let bitmap = encoded[0];
        assert_eq!(
            bitmap, 2,
            "Bitmap should mark 'name' as missing (bit 1) but 'age' as present (bit 0)"
        );
    }

    // ========== Issue #492: streaming DataWriter tests ==========

    /// Produce the fixed set of 16 single-mutation partitions shared by the
    /// streaming tests.
    ///
    /// Partition `i` (0..16) is keyed by `DecoratedKey::new(i, <4-byte
    /// big-endian i>)` and carries one mutation that writes the text
    /// `"partition-{i}"` to column `name` at timestamp `1_001_000 + i`.
    fn streaming_test_partitions() -> Vec<(DecoratedKey, Vec<Mutation>)> {
        const PARTITION_COUNT: u32 = 16;

        let table_id = TableId::new("test_ks", "test_table");
        let mut partitions = Vec::with_capacity(PARTITION_COUNT as usize);

        for i in 0..PARTITION_COUNT {
            // Lossless widening; used both as the first argument of the
            // decorated key and as the per-partition timestamp offset.
            let index = i64::from(i);

            let write_name = CellOperation::Write {
                column: "name".to_string(),
                value: Value::Text(format!("partition-{i}")),
            };
            let mutation = Mutation::new(
                table_id.clone(),
                PartitionKey::single("id", Value::Integer(i as i32)),
                None,
                vec![write_name],
                1_001_000 + index,
                None,
            );

            let key = DecoratedKey::new(index, i.to_be_bytes().to_vec());
            partitions.push((key, vec![mutation]));
        }

        partitions
    }

    /// Byte-identical guard (Issue #492).
    ///
    /// Writes the same partitions once through the in-memory writer and once
    /// through the streaming (file-sink) writer, then requires that:
    /// - the per-partition offsets returned by both writers are equal,
    /// - the file on disk equals the in-memory bytes exactly,
    /// - the size reported by `finish_streaming()` equals that byte count,
    /// - every returned offset lands on a partition's key-length prefix.
    ///
    /// Any divergence would break the offsets recorded in Index.db.
    #[test]
    fn test_streaming_writer_byte_identical_to_in_memory() {
        let schema = create_test_schema();
        let partitions = streaming_test_partitions();

        // Reference run: everything accumulates in the writer's own buffer.
        let mut mem_writer = DataWriter::new(create_test_stats());
        let mem_offsets: Vec<_> = partitions
            .iter()
            .map(|(key, mutations)| {
                mem_writer
                    .write_partition(key, mutations, &schema, None, &[])
                    .unwrap()
            })
            .collect();
        let expected_bytes = mem_writer.finish().unwrap();

        // Streaming run: the writer is backed by a temporary Data.db file.
        let dir = tempfile::tempdir().unwrap();
        let data_path = dir.path().join("nb-1-big-Data.db");
        let mut stream_writer = DataWriter::with_sink(create_test_stats(), data_path.clone());
        let stream_offsets: Vec<_> = partitions
            .iter()
            .map(|(key, mutations)| {
                stream_writer
                    .write_partition(key, mutations, &schema, None, &[])
                    .unwrap()
            })
            .collect();
        let data_size = stream_writer.finish_streaming().unwrap();

        // These offsets are what the caller feeds into Index.db.
        assert_eq!(
            stream_offsets, mem_offsets,
            "streaming partition offsets must equal in-memory offsets"
        );

        // File contents and reported size must both agree with the reference.
        let on_disk = std::fs::read(&data_path).unwrap();
        assert_eq!(
            on_disk, expected_bytes,
            "streamed Data.db must be byte-identical to in-memory Data.db"
        );
        assert_eq!(
            data_size as usize,
            expected_bytes.len(),
            "finish_streaming() data_size must equal file length"
        );

        // A partition begins with its 2-byte key length; every test key is
        // four bytes long, so each offset must point at 0x0004.
        for off in stream_offsets.iter().copied() {
            let prefix_start = off as usize;
            let prefix_end = prefix_start + 2;
            assert_eq!(
                &on_disk[prefix_start..prefix_end],
                &[0x00, 0x04],
                "offset {off} must land on a partition's key-length prefix"
            );
        }
    }

    /// Bounded-memory evidence (Issue #492).
    ///
    /// After every `write_partition` call on a sink-backed writer the scratch
    /// buffer must be empty again while the flushed position keeps growing.
    /// Together with the final check that the largest single partition is
    /// smaller than the whole file, this is the test's evidence that resident
    /// Data.db bytes scale with the largest partition rather than the file.
    #[test]
    fn test_streaming_writer_bounds_memory_to_one_partition() {
        let schema = create_test_schema();
        let partitions = streaming_test_partitions();

        let dir = tempfile::tempdir().unwrap();
        let mut writer = DataWriter::with_sink(
            create_test_stats(),
            dir.path().join("nb-1-big-Data.db"),
        );

        // Largest number of bytes flushed for any single partition so far.
        // Since the scratch is verified empty after each partition, this is
        // the bound on resident Data.db bytes.
        let mut max_partition_size = 0usize;
        let mut prev_flushed = 0u64;

        for (i, partition) in partitions.iter().enumerate() {
            let (key, mutations) = partition;

            let flushed_before = writer.flushed_position();
            writer
                .write_partition(key, mutations, &schema, None, &[])
                .unwrap();
            let flushed_after = writer.flushed_position();

            // Nothing from this (or any earlier) partition may linger in the
            // scratch once the write has returned.
            assert_eq!(
                writer.scratch_len(),
                0,
                "scratch must be cleared after partition {i} (bounded memory)"
            );

            // The flushed position has to move forward; the distance moved is
            // the size of the partition just written.
            assert!(
                flushed_after > flushed_before,
                "flushed position must grow after writing partition {i}"
            );
            let this_partition_size = (flushed_after - flushed_before) as usize;
            if this_partition_size > max_partition_size {
                max_partition_size = this_partition_size;
            }

            assert!(flushed_after > prev_flushed);
            prev_flushed = flushed_after;
        }

        let total = writer.finish_streaming().unwrap();
        assert_eq!(
            total, prev_flushed,
            "total size must equal last flushed pos"
        );

        // With many partitions, the biggest one is strictly smaller than the
        // file as a whole.
        assert!(
            total > max_partition_size as u64,
            "largest single partition ({max_partition_size}) must be smaller than the full file ({total})"
        );
    }
}