lucisearch 0.8.0

//! Posting list encoding and decoding.
//!
//! Encodes sorted `(DocId, term_frequency)` pairs using delta-encoded doc IDs
//! and VByte encoding. Format:
//!
//! ```text
//! [num_docs: u32 LE] [flags: u8] [delta_0: vbyte] [tf_0: vbyte] [delta_1: vbyte] [tf_1: vbyte] ...
//! ```
//!
//! See [[inverted-index]] and [[architecture-overview#Step 1]].

use crate::core::DocId;

// --- VByte encoding ---

/// Append the VByte (variable-length) encoding of `value` to `out`.
///
/// The value is written least-significant group first, 7 data bits per byte,
/// so a `u32` occupies between 1 and 5 bytes. The high bit of each byte is a
/// continuation flag: set on every byte except the last one.
fn encode_vbyte(mut value: u32, out: &mut Vec<u8>) {
    // Every group that still has higher-order bits behind it is flagged as
    // "more bytes follow".
    while value >= 0x80 {
        out.push((value & 0x7F) as u8 | 0x80);
        value >>= 7;
    }
    // The remainder fits in 7 bits; its clear high bit ends the sequence.
    out.push(value as u8);
}

/// Decode one VByte-encoded `u32` starting at `data[pos]`.
///
/// Returns `(value, bytes_consumed)`. Bytes are read until one with a clear
/// high bit is found; each byte contributes its low 7 bits, least-significant
/// group first.
///
/// Bits that do not fit in a `u32` are discarded. This covers the upper bits
/// of a fifth byte and every data bit of an overlong sequence (six bytes or
/// more). Overlong sequences are still consumed up to their terminating byte,
/// so the returned length always matches the encoded length.
///
/// # Panics
///
/// Panics with an out-of-bounds index if `data` ends before a terminating
/// byte (high bit clear) is found.
fn decode_vbyte(data: &[u8], pos: usize) -> (u32, usize) {
    let mut result: u32 = 0;
    let mut shift: u32 = 0;
    let mut i = pos;
    loop {
        let byte = data[i];
        i += 1;
        // `checked_shl` returns `None` once `shift >= 32`. A plain `<<` would
        // panic there in debug builds and silently wrap the shift amount in
        // release builds, yielding a corrupted value.
        result |= u32::from(byte & 0x7F).checked_shl(shift).unwrap_or(0);
        if byte & 0x80 == 0 {
            break;
        }
        shift = shift.saturating_add(7);
    }
    (result, i - pos)
}

// --- PostingListWriter ---

/// Builds a posting list from sorted (doc_id, tf) pairs.
///
/// Callers must add documents in strictly increasing doc_id order.
pub struct PostingListWriter {
    /// VByte-encoded `(delta, tf)` pairs accumulated so far; the header is
    /// only prepended when the list is finished.
    buf: Vec<u8>,
    /// Number of postings added so far; becomes the `num_docs` header field.
    count: u32,
    /// Doc ID of the most recently added posting, used to compute the next
    /// delta. Starts at 0.
    last_doc_id: u32,
}

impl PostingListWriter {
    /// Create a writer that holds no postings yet.
    pub fn new() -> Self {
        Self {
            buf: Vec::new(),
            count: 0,
            last_doc_id: 0,
        }
    }

    /// Append one posting.
    ///
    /// `doc_id` must be strictly greater than the one passed to the previous
    /// call; this is only checked in builds with debug assertions enabled.
    /// The first posting stores its doc ID as-is, every later one stores the
    /// gap to its predecessor.
    pub fn add(&mut self, doc_id: DocId, tf: u32) {
        let current = doc_id.as_u32();
        let gap = match self.count {
            0 => current,
            _ => {
                debug_assert!(current > self.last_doc_id, "doc IDs must be strictly increasing");
                current - self.last_doc_id
            }
        };

        // Each posting is two consecutive VByte values: gap, then frequency.
        for value in [gap, tf] {
            encode_vbyte(value, &mut self.buf);
        }

        self.last_doc_id = current;
        self.count += 1;
    }

    /// Consume the writer and return the encoded posting list:
    /// `[num_docs: u32 LE] [flags: u8 = 0x00] [postings...]`.
    pub fn finish(self) -> Vec<u8> {
        let Self { buf, count, .. } = self;

        // 4 bytes of count + 1 flags byte precede the posting data.
        let mut encoded = Vec::with_capacity(5 + buf.len());
        encoded.extend_from_slice(&count.to_le_bytes());
        encoded.push(0x00); // flags: no positions
        encoded.extend_from_slice(&buf);
        encoded
    }
}

impl Default for PostingListWriter {
    fn default() -> Self {
        Self::new()
    }
}

// --- PostingListReader ---

/// Iterates over a posting list, yielding (DocId, term_frequency) pairs.
///
/// Handles both the original format and the position-aware format. When
/// reading position-aware data, positions are skipped automatically.
/// Block-max lists are also accepted: their block headers are skipped at
/// construction and the postings are then read sequentially.
pub struct PostingListReader<'a> {
    /// The complete encoded posting list, header included.
    data: &'a [u8],
    /// Byte offset into `data` of the next posting to decode.
    pos: usize,
    /// Number of postings not yet returned by `next`.
    remaining: u32,
    /// Running doc ID: 0 before the first posting, then the ID of the last
    /// posting returned. Each decoded delta is added to it.
    current_doc_id: u32,
    /// Whether the flags byte marks the list as position-aware, in which case
    /// `tf` position entries follow each posting and must be skipped.
    has_positions: bool,
}

impl<'a> PostingListReader<'a> {
    /// Build a reader over encoded posting list bytes.
    ///
    /// Accepts the plain format, the position-aware format (positions are
    /// skipped, only doc_id and tf are returned; use
    /// `PositionPostingListReader` to read positions) and the block-max
    /// format (block headers are skipped). Input shorter than the 5-byte
    /// header is treated as an empty list.
    pub fn new(data: &'a [u8]) -> Self {
        // Count and flags are only meaningful when the full header is present.
        let (remaining, is_block_max) = match data {
            [b0, b1, b2, b3, flags, ..] => (
                u32::from_le_bytes([*b0, *b1, *b2, *b3]),
                *flags == FLAG_BLOCK_MAX,
            ),
            _ => (0, false),
        };

        // Offset of the first posting depends on the format.
        let pos = if is_block_max {
            // [num_docs(4)][flags(1)][num_blocks(2)] then 8 bytes per block header.
            let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap()) as usize;
            7 + num_blocks * 8
        } else {
            // [num_docs(4)][flags(1)]
            5
        };

        Self {
            data,
            pos,
            remaining,
            current_doc_id: 0,
            has_positions: has_positions(data),
        }
    }

    /// Total number of postings according to the header (independent of how
    /// many have already been read). Returns 0 when the header is incomplete.
    pub fn len(&self) -> u32 {
        match self.data {
            [b0, b1, b2, b3, _, ..] => u32::from_le_bytes([*b0, *b1, *b2, *b3]),
            _ => 0,
        }
    }

    /// Whether the header reports zero postings.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Decode one VByte value at the cursor and move the cursor past it.
    fn read_vbyte(&mut self) -> u32 {
        let (value, width) = decode_vbyte(self.data, self.pos);
        self.pos += width;
        value
    }

    /// Return the next (DocId, tf) pair, or `None` once every posting has
    /// been read. Position entries, if the list has them, are stepped over.
    pub fn next(&mut self) -> Option<(DocId, u32)> {
        if self.remaining == 0 {
            return None;
        }

        let delta = self.read_vbyte();
        let tf = self.read_vbyte();

        // A position-aware list stores `tf` position values after each posting.
        if self.has_positions {
            for _ in 0..tf {
                self.read_vbyte();
            }
        }

        self.current_doc_id += delta;
        self.remaining -= 1;

        Some((DocId(self.current_doc_id), tf))
    }
}

// --- Position-aware posting list ---

/// Builds a posting list that includes term positions (for phrase queries).
///
/// Format:
/// ```text
/// [num_docs: u32] [flags: u8 = 0x01]
/// Per doc: [delta: vbyte] [tf: vbyte] [pos_0: vbyte] [pos_delta_1: vbyte] ...
/// ```
///
/// Positions are delta-encoded within each document. `tf` is the number of
/// positions supplied for the document, so it also tells a reader how many
/// position values follow.
pub struct PositionPostingListWriter {
    /// VByte-encoded postings (delta, tf, positions) accumulated so far; the
    /// header is only prepended when the list is finished.
    buf: Vec<u8>,
    /// Number of postings added so far; becomes the `num_docs` header field.
    count: u32,
    /// Doc ID of the most recently added posting, used to compute the next
    /// delta. Starts at 0.
    last_doc_id: u32,
}

/// Flags byte value (header byte 4) marking a position-aware posting list.
///
/// Readers in this file compare the whole flags byte for equality rather than
/// testing individual bits, so the flag values are mutually exclusive.
const FLAG_HAS_POSITIONS: u8 = 0x01;
/// Flags byte value (header byte 4) marking a block-max posting list.
const FLAG_BLOCK_MAX: u8 = 0x02;

/// Block size for block-max posting lists (matches Lucene convention).
/// Every block except possibly the last holds exactly this many postings.
const BLOCK_SIZE: usize = 128;

impl PositionPostingListWriter {
    /// Create a writer that holds no postings yet.
    pub fn new() -> Self {
        Self {
            buf: Vec::new(),
            count: 0,
            last_doc_id: 0,
        }
    }

    /// Append one posting together with its term positions.
    ///
    /// `doc_id` must be strictly greater than the previous one (checked only
    /// when debug assertions are enabled) and `positions` must be sorted in
    /// ascending order. The term frequency written is `positions.len()`.
    pub fn add(&mut self, doc_id: DocId, positions: &[u32]) {
        let id = doc_id.as_u32();
        // The first posting stores its absolute ID, later ones the gap.
        let gap = match self.count {
            0 => id,
            _ => {
                debug_assert!(id > self.last_doc_id);
                id - self.last_doc_id
            }
        };

        encode_vbyte(gap, &mut self.buf);
        encode_vbyte(positions.len() as u32, &mut self.buf);

        // Positions are stored as gaps too; the first gap is measured from 0.
        let mut previous = 0u32;
        for &position in positions {
            encode_vbyte(position - previous, &mut self.buf);
            previous = position;
        }

        self.last_doc_id = id;
        self.count += 1;
    }

    /// Consume the writer and return the encoded list:
    /// `[num_docs: u32 LE] [flags: u8 = FLAG_HAS_POSITIONS] [postings...]`.
    pub fn finish(self) -> Vec<u8> {
        let Self { buf, count, .. } = self;

        // 4 bytes of count + 1 flags byte precede the posting data.
        let mut encoded = Vec::with_capacity(5 + buf.len());
        encoded.extend_from_slice(&count.to_le_bytes());
        encoded.push(FLAG_HAS_POSITIONS);
        encoded.extend_from_slice(&buf);
        encoded
    }
}

impl Default for PositionPostingListWriter {
    fn default() -> Self {
        Self::new()
    }
}

/// Iterates over a position-aware posting list.
///
/// Uses a reusable internal buffer for positions to avoid per-document
/// heap allocation. Supports `advance()` to skip intermediate documents
/// without materializing their positions (their encoded bytes are still
/// walked, because VByte values have no fixed width).
pub struct PositionPostingListReader<'a> {
    /// The complete encoded posting list, header included.
    data: &'a [u8],
    /// Byte offset into `data` of the next value to decode.
    pos: usize,
    /// Number of postings not yet consumed.
    remaining: u32,
    /// Running doc ID: 0 before the first posting, then the ID of the last
    /// posting read. Each decoded delta is added to it.
    current_doc_id: u32,
    /// Reusable position buffer — cleared and refilled on each next/advance.
    /// Holds absolute (not delta) positions.
    position_buf: Vec<u32>,
    /// Cached first position and TF for the fast TF=1 path.
    cached_first_pos: u32,
    cached_tf: u32,
}

impl<'a> PositionPostingListReader<'a> {
    /// Create a reader from position-aware posting list bytes.
    ///
    /// Input shorter than the 5-byte header (`num_docs: u32` + flags byte) is
    /// treated as an empty list.
    pub fn new(data: &'a [u8]) -> Self {
        Self {
            data,
            pos: 5, // skip num_docs (4) + flags (1)
            remaining: Self::header_count(data),
            current_doc_id: 0,
            position_buf: Vec::new(),
            cached_first_pos: 0,
            cached_tf: 0,
        }
    }

    /// Read `num_docs` from the header, or 0 when the 5-byte header is
    /// incomplete. Shared by `new` and `len` so both always agree.
    fn header_count(data: &[u8]) -> u32 {
        match data {
            [b0, b1, b2, b3, _, ..] => u32::from_le_bytes([*b0, *b1, *b2, *b3]),
            _ => 0,
        }
    }

    /// Decode one VByte value at the cursor and move the cursor past it.
    fn read_vbyte(&mut self) -> u32 {
        let (value, width) = decode_vbyte(self.data, self.pos);
        self.pos += width;
        value
    }

    /// Total number of postings according to the header (independent of how
    /// many have already been read). Returns 0 when the header is incomplete,
    /// matching what iteration will yield.
    pub fn len(&self) -> u32 {
        Self::header_count(self.data)
    }

    /// Whether the header reports zero postings.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Current document's positions (valid after next() or advance()).
    ///
    /// After `next_doc()` the buffer is not refreshed for TF=1 documents;
    /// use `first_position()` there.
    pub fn positions(&self) -> &[u32] {
        &self.position_buf
    }

    /// The first position for the current document. For TF=1 docs (common
    /// in phrase queries), this is the only position. Uses cached value,
    /// no Vec access. Reads as 0 for a document with TF=0.
    #[inline(always)]
    pub fn first_position(&self) -> u32 {
        self.cached_first_pos
    }

    /// Term frequency of the current document (valid after next/advance).
    #[inline(always)]
    pub fn current_tf(&self) -> u32 {
        self.cached_tf
    }

    /// Decode the `tf` positions of the current document into the internal
    /// buffer (as absolute positions) and refresh the cached TF and first
    /// position.
    ///
    /// A document with `tf == 0` has no position bytes on disk, so nothing is
    /// read for it. Reading one anyway would swallow the next posting's delta.
    fn decode_positions(&mut self, tf: u32) {
        self.cached_tf = tf;
        self.position_buf.clear();

        if tf == 0 {
            self.cached_first_pos = 0;
            return;
        }

        // The first stored value is the absolute first position (delta from 0).
        let first = self.read_vbyte();
        self.cached_first_pos = first;
        self.position_buf.push(first);

        // Remaining values are gaps to the previous position.
        let mut last_pos = first;
        for _ in 1..tf {
            last_pos += self.read_vbyte();
            self.position_buf.push(last_pos);
        }
    }

    /// Step over the `tf` position values of a document, discarding them.
    fn skip_positions(&mut self, tf: u32) {
        for _ in 0..tf {
            self.read_vbyte();
        }
    }

    /// Read the next posting with positions.
    ///
    /// Returns the doc ID and an owned copy of its positions, or `None` once
    /// the list is exhausted.
    pub fn next(&mut self) -> Option<(DocId, Vec<u32>)> {
        if self.remaining == 0 {
            return None;
        }

        let delta = self.read_vbyte();
        let tf = self.read_vbyte();

        self.current_doc_id += delta;
        self.decode_positions(tf);
        self.remaining -= 1;

        Some((DocId(self.current_doc_id), self.position_buf.clone()))
    }

    /// Advance to the first document >= target, decoding its positions.
    /// Skips intermediate documents without materializing their positions.
    /// Returns the doc_id if found, or None if exhausted.
    /// After calling, use `positions()` to access the decoded positions.
    pub fn advance(&mut self, target: DocId) -> Option<DocId> {
        let target_val = target.as_u32();
        while self.remaining > 0 {
            let delta = self.read_vbyte();
            let tf = self.read_vbyte();

            self.current_doc_id += delta;
            self.remaining -= 1;

            if self.current_doc_id >= target_val {
                self.decode_positions(tf);
                return Some(DocId(self.current_doc_id));
            }

            // Not there yet: step over this document's positions.
            self.skip_positions(tf);
        }
        None
    }

    /// Advance to the next document sequentially (no target, no skip check).
    /// Faster than advance() when iterating all docs in order.
    ///
    /// For TF=1 (most common) only `first_position()` is updated and the
    /// position buffer is left untouched, so `positions()` must not be relied
    /// on for such documents. For TF=0 no position bytes are read.
    #[inline(always)]
    pub fn next_doc(&mut self) -> Option<DocId> {
        if self.remaining == 0 {
            return None;
        }

        let delta = self.read_vbyte();
        let tf = self.read_vbyte();

        self.current_doc_id += delta;
        self.cached_tf = tf;

        match tf {
            // No positions were written for this document.
            0 => {
                self.cached_first_pos = 0;
                self.position_buf.clear();
            }
            // Fast path: single position, cached only, Vec skipped entirely.
            1 => {
                self.cached_first_pos = self.read_vbyte();
            }
            // Rare: multi-position doc, decode everything into the buffer.
            _ => self.decode_positions(tf),
        }

        self.remaining -= 1;
        Some(DocId(self.current_doc_id))
    }

    /// Current document ID (valid after next() or advance()).
    pub fn current_doc_id(&self) -> u32 {
        self.current_doc_id
    }
}

// --- Block-Max posting list ---

/// Builds a posting list with per-block max TF metadata for WAND optimization.
///
/// Format:
/// ```text
/// [num_docs: u32] [flags: u8 = 0x02] [num_blocks: u16]
/// [block_headers: (last_doc_id: u32, max_tf: u16, data_len: u16) × num_blocks]
/// [block_data: delta-encoded doc_ids + TFs per block]
/// ```
///
/// All fixed-width fields are little-endian, so each block header is 8 bytes.
/// `max_tf` saturates at `u16::MAX`. Doc ID deltas continue across block
/// boundaries: the first delta of a block is relative to the last doc ID of
/// the previous block (or to 0 for the first block).
///
/// See [[architecture-query-execution#WAND / MaxScore Optimization]].
pub struct BlockMaxPostingListWriter {
    /// Postings buffered until the list is finished, in insertion order.
    entries: Vec<(u32, u32)>, // (doc_id, tf)
}

impl BlockMaxPostingListWriter {
    /// Create a writer that holds no postings yet.
    pub fn new() -> Self {
        Self {
            entries: Vec::new(),
        }
    }

    /// Buffer one posting. Encoding happens in `finish`.
    ///
    /// `doc_id` must be strictly greater than the previously added one. As in
    /// the other writers in this file, this is checked only when debug
    /// assertions are enabled.
    pub fn add(&mut self, doc_id: DocId, tf: u32) {
        let id = doc_id.as_u32();
        debug_assert!(
            self.entries.last().map_or(true, |&(prev, _)| id > prev),
            "doc IDs must be strictly increasing"
        );
        self.entries.push((id, tf));
    }

    /// Consume the writer and return the encoded block-max posting list:
    /// header, then one 8-byte header per block, then the block data.
    ///
    /// # Panics
    ///
    /// Panics if the postings need more than `u16::MAX` blocks. The format
    /// stores the block count as `u16`, and truncating it would produce a
    /// list whose headers no longer match `num_docs`.
    pub fn finish(self) -> Vec<u8> {
        let num_docs = self.entries.len() as u32;

        let block_count = self.entries.chunks(BLOCK_SIZE).len();
        assert!(
            block_count <= usize::from(u16::MAX),
            "block-max posting list needs {} blocks; the format stores the block count as u16",
            block_count
        );
        let num_blocks = block_count as u16;

        // Headers and block data are built side by side in two flat buffers;
        // the headers must precede the data in the output but need each
        // block's encoded length, which is only known after encoding.
        let mut headers: Vec<u8> = Vec::with_capacity(block_count * 8);
        let mut body: Vec<u8> = Vec::new();

        // Deltas run across block boundaries: the first posting of a block is
        // encoded relative to the last doc ID of the previous block (0 for
        // the first block).
        let mut prev = 0u32;
        for block in self.entries.chunks(BLOCK_SIZE) {
            let body_start = body.len();
            let mut max_tf = 0u32;
            for &(doc_id, tf) in block {
                encode_vbyte(doc_id - prev, &mut body);
                encode_vbyte(tf, &mut body);
                max_tf = max_tf.max(tf);
                prev = doc_id;
            }

            // `chunks` never yields an empty block, so `prev` is now the
            // block's last doc ID.
            let last_doc_id = prev;
            // Saturate rather than truncate: an over-estimate keeps the
            // stored value an upper bound on the block's TFs.
            let max_tf_u16 = max_tf.min(u32::from(u16::MAX)) as u16;
            // At most BLOCK_SIZE postings of two <=5-byte VBytes each
            // (1280 bytes), so this always fits in u16.
            let data_len = (body.len() - body_start) as u16;

            headers.extend_from_slice(&last_doc_id.to_le_bytes());
            headers.extend_from_slice(&max_tf_u16.to_le_bytes());
            headers.extend_from_slice(&data_len.to_le_bytes());
        }

        // Assemble final output.
        let mut result = Vec::with_capacity(4 + 1 + 2 + headers.len() + body.len());
        result.extend_from_slice(&num_docs.to_le_bytes());
        result.push(FLAG_BLOCK_MAX);
        result.extend_from_slice(&num_blocks.to_le_bytes());
        result.extend_from_slice(&headers);
        result.extend_from_slice(&body);
        result
    }
}

impl Default for BlockMaxPostingListWriter {
    fn default() -> Self {
        Self::new()
    }
}

/// Iterates over a block-max posting list, yielding (DocId, TF) pairs.
///
/// Supports block-level skipping via `advance_to_block()` for WAND optimization.
pub struct BlockMaxPostingListReader<'a> {
    /// The complete encoded posting list, header included.
    data: &'a [u8],
    /// Total number of postings, from the header.
    num_docs: u32,
    /// Number of blocks, from the header.
    num_blocks: u16,
    /// Byte offset in `data` of the first 8-byte block header.
    headers_start: usize,

    // Per-block cumulative data offsets (computed once at construction).
    // Entry `i` is the byte offset in `data` where block `i` starts; one
    // extra sentinel entry marks the end of the last block.
    block_data_offsets: Vec<usize>,

    // Iteration state
    /// Index of the block the reader currently points at.
    current_block: u16,
    /// Byte offset in `data` of the next posting to decode.
    pos_in_data: usize,
    /// Postings left to decode in the block being iterated.
    remaining_in_block: u16,
    /// Running doc ID that decoded deltas are added to.
    current_doc_id: u32,
    /// Postings left to return across all remaining blocks.
    total_remaining: u32,
}

impl<'a> BlockMaxPostingListReader<'a> {
    /// Build a reader over block-max encoded bytes.
    ///
    /// Parses the fixed header, precomputes where each block's data starts,
    /// and positions the reader at the first posting of block 0.
    ///
    /// Panics if `data` is too short to hold the header and block headers.
    /// The flags byte is only verified when debug assertions are enabled.
    pub fn new(data: &'a [u8]) -> Self {
        let num_docs = u32::from_le_bytes(data[0..4].try_into().unwrap());
        debug_assert_eq!(data[4], FLAG_BLOCK_MAX);
        let num_blocks = u16::from_le_bytes(data[5..7].try_into().unwrap());

        let headers_start = 7;
        let block_count = num_blocks as usize;
        let block_data_start = headers_start + block_count * 8;

        // Running sum of the per-block `data_len` fields: entry `i` is where
        // block `i` starts, and the final entry is the end-of-data sentinel.
        let mut block_data_offsets = Vec::with_capacity(block_count + 1);
        block_data_offsets.push(block_data_start);
        let mut running = block_data_start;
        for header in 0..block_count {
            // `data_len` sits in the last two bytes of the 8-byte header.
            let len_at = headers_start + header * 8 + 6;
            let data_len = u16::from_le_bytes(data[len_at..len_at + 2].try_into().unwrap());
            running += data_len as usize;
            block_data_offsets.push(running);
        }

        // Only a lone block can be partially filled; otherwise block 0 is full.
        let first_block_docs = match num_blocks {
            0 => 0,
            1 => num_docs as u16,
            _ => BLOCK_SIZE as u16,
        };

        Self {
            data,
            num_docs,
            num_blocks,
            headers_start,
            block_data_offsets,
            current_block: 0,
            pos_in_data: block_data_start,
            remaining_in_block: first_block_docs,
            current_doc_id: 0,
            total_remaining: num_docs,
        }
    }

    /// Total number of postings in the list, as recorded in the header.
    pub fn len(&self) -> u32 {
        self.num_docs
    }

    /// Whether the header reports zero postings.
    pub fn is_empty(&self) -> bool {
        self.num_docs == 0
    }

    /// Number of blocks, as recorded in the header.
    pub fn num_blocks(&self) -> u16 {
        self.num_blocks
    }

    /// Byte offset in `data` of the 8-byte header describing `block`.
    fn header_offset(&self, block: u16) -> usize {
        self.headers_start + block as usize * 8
    }

    /// Max TF stored in the header of `block` (bytes 4..6 of the header).
    pub fn block_max_tf(&self, block: u16) -> u16 {
        let at = self.header_offset(block) + 4;
        u16::from_le_bytes(self.data[at..at + 2].try_into().unwrap())
    }

    /// Last doc ID stored in the header of `block` (bytes 0..4 of the header).
    pub fn block_last_doc(&self, block: u16) -> u32 {
        let at = self.header_offset(block);
        u32::from_le_bytes(self.data[at..at + 4].try_into().unwrap())
    }

    /// Number of postings in `block`: `BLOCK_SIZE` for every block but the
    /// last, which holds whatever is left over.
    fn block_doc_count(&self, block: u16) -> u16 {
        let last_block = self.num_blocks as usize - 1;
        if (block as usize) < last_block {
            BLOCK_SIZE as u16
        } else {
            let in_full_blocks = last_block * BLOCK_SIZE;
            (self.num_docs as usize - in_full_blocks) as u16
        }
    }

    /// Binary-search the block headers, starting at the current block, for
    /// the first block whose last doc ID is >= `target_val`. Returns
    /// `num_blocks` when no such block exists.
    fn first_block_reaching(&self, target_val: u32) -> usize {
        let mut lo = self.current_block as usize;
        let mut hi = self.num_blocks as usize;
        while lo < hi {
            let mid = lo + (hi - lo) / 2;
            if self.block_last_doc(mid as u16) >= target_val {
                hi = mid;
            } else {
                lo = mid + 1;
            }
        }
        lo
    }

    /// Position the block pointer at the block containing `target` without
    /// decoding individual postings. Used by WAND to read `block_max_tf()`
    /// for the correct block.
    ///
    /// This only updates `current_block` — it does NOT advance the doc
    /// iteration state. Call `advance_to_block()` for full block skipping.
    /// If `target` lies beyond the last block, `current_block` is unchanged.
    ///
    /// NOTE(review): `next()` uses `current_block` to pick the following
    /// block once the block being iterated runs out, so calling `next()`
    /// after this without an intervening `advance_to_block()` looks unsafe —
    /// confirm callers always pair the two.
    pub fn advance_shallow(&mut self, target: DocId) {
        let block = self.first_block_reaching(target.as_u32());
        if block < self.num_blocks as usize {
            self.current_block = block as u16;
        }
    }

    /// Skip to the first block whose last_doc_id >= target.
    /// Positions the reader at the start of that block, even when that is
    /// the block currently being read. If no block qualifies, the reader
    /// becomes exhausted.
    pub fn advance_to_block(&mut self, target: DocId) {
        // The search result never exceeds `num_blocks` (itself a u16), so the
        // cast is lossless; `seek_to_block` handles the "past all blocks"
        // case by marking the reader exhausted.
        let block = self.first_block_reaching(target.as_u32());
        self.seek_to_block(block as u16);
    }

    /// Position the reader at the first posting of `block`, or mark it
    /// exhausted when `block` is out of range.
    fn seek_to_block(&mut self, block: u16) {
        if block >= self.num_blocks {
            self.total_remaining = 0;
            self.remaining_in_block = 0;
            return;
        }

        // Every earlier block is full, so this many postings are skipped.
        let skipped = block as usize * BLOCK_SIZE;
        self.total_remaining = self.num_docs.saturating_sub(skipped as u32);
        self.remaining_in_block = self.block_doc_count(block);
        self.current_block = block;
        self.pos_in_data = self.block_data_offsets[block as usize];

        // Deltas in a block continue from the previous block's last doc ID.
        self.current_doc_id = match block {
            0 => 0,
            _ => self.block_last_doc(block - 1),
        };
    }

    /// Decode one VByte value at the cursor and move the cursor past it.
    fn read_vbyte(&mut self) -> u32 {
        let (value, width) = decode_vbyte(self.data, self.pos_in_data);
        self.pos_in_data += width;
        value
    }

    /// Return the next (DocId, TF) pair, or `None` once the list is exhausted.
    pub fn next(&mut self) -> Option<(DocId, u32)> {
        if self.total_remaining == 0 {
            return None;
        }

        // The block being iterated is used up: roll over to the one after
        // `current_block`.
        if self.remaining_in_block == 0 {
            let upcoming = self.current_block + 1;
            if upcoming >= self.num_blocks {
                self.total_remaining = 0;
                return None;
            }
            self.current_block = upcoming;
            self.remaining_in_block = self.block_doc_count(upcoming);
            self.pos_in_data = self.block_data_offsets[upcoming as usize];
        }

        let delta = self.read_vbyte();
        let tf = self.read_vbyte();

        self.current_doc_id += delta;
        self.remaining_in_block -= 1;
        self.total_remaining -= 1;

        Some((DocId(self.current_doc_id), tf))
    }

    /// Index of the block the reader currently points at.
    pub fn current_block_idx(&self) -> u16 {
        self.current_block
    }
}

/// Report whether `data` is a block-max posting list, i.e. whether its flags
/// byte (index 4) equals `FLAG_BLOCK_MAX`. Slices too short to contain a
/// flags byte yield `false`.
pub fn has_block_max(data: &[u8]) -> bool {
    data.get(4) == Some(&FLAG_BLOCK_MAX)
}

/// Report whether `data` is a position-aware posting list, i.e. whether its
/// flags byte (index 4) equals `FLAG_HAS_POSITIONS`. Slices too short to
/// contain a flags byte yield `false`.
pub fn has_positions(data: &[u8]) -> bool {
    data.get(4) == Some(&FLAG_HAS_POSITIONS)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn vbyte_single_byte() {
        // Values 0-127 fit in one byte
        let mut buf = Vec::new();
        encode_vbyte(0, &mut buf);
        assert_eq!(buf.len(), 1);
        let (val, consumed) = decode_vbyte(&buf, 0);
        assert_eq!(val, 0);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn vbyte_boundary_127() {
        let mut buf = Vec::new();
        encode_vbyte(127, &mut buf);
        assert_eq!(buf.len(), 1);
        let (val, _) = decode_vbyte(&buf, 0);
        assert_eq!(val, 127);
    }

    #[test]
    fn vbyte_boundary_128() {
        let mut buf = Vec::new();
        encode_vbyte(128, &mut buf);
        assert_eq!(buf.len(), 2);
        let (val, consumed) = decode_vbyte(&buf, 0);
        assert_eq!(val, 128);
        assert_eq!(consumed, 2);
    }

    #[test]
    fn vbyte_16383() {
        let mut buf = Vec::new();
        encode_vbyte(16383, &mut buf);
        assert_eq!(buf.len(), 2);
        let (val, _) = decode_vbyte(&buf, 0);
        assert_eq!(val, 16383);
    }

    #[test]
    fn vbyte_16384() {
        let mut buf = Vec::new();
        encode_vbyte(16384, &mut buf);
        assert_eq!(buf.len(), 3);
        let (val, _) = decode_vbyte(&buf, 0);
        assert_eq!(val, 16384);
    }

    #[test]
    fn vbyte_large_value() {
        let mut buf = Vec::new();
        let val = u32::MAX - 1;
        encode_vbyte(val, &mut buf);
        assert_eq!(buf.len(), 5);
        let (decoded, _) = decode_vbyte(&buf, 0);
        assert_eq!(decoded, val);
    }

    #[test]
    fn vbyte_max_value() {
        let mut buf = Vec::new();
        encode_vbyte(u32::MAX, &mut buf);
        let (decoded, _) = decode_vbyte(&buf, 0);
        assert_eq!(decoded, u32::MAX);
    }

    #[test]
    fn round_trip_single_doc() {
        let mut writer = PostingListWriter::new();
        writer.add(DocId(5), 3);
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.len(), 1);
        assert_eq!(reader.next(), Some((DocId(5), 3)));
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn round_trip_multiple_docs() {
        let mut writer = PostingListWriter::new();
        writer.add(DocId(1), 2);
        writer.add(DocId(5), 1);
        writer.add(DocId(100), 4);
        writer.add(DocId(1000), 1);
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.len(), 4);
        assert_eq!(reader.next(), Some((DocId(1), 2)));
        assert_eq!(reader.next(), Some((DocId(5), 1)));
        assert_eq!(reader.next(), Some((DocId(100), 4)));
        assert_eq!(reader.next(), Some((DocId(1000), 1)));
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn round_trip_consecutive_doc_ids() {
        let mut writer = PostingListWriter::new();
        for i in 0..10 {
            writer.add(DocId(i), 1);
        }
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.len(), 10);
        for i in 0..10 {
            assert_eq!(reader.next(), Some((DocId(i), 1)));
        }
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn round_trip_varying_tf() {
        let mut writer = PostingListWriter::new();
        writer.add(DocId(0), 0);
        writer.add(DocId(1), 1);
        writer.add(DocId(2), 127);
        writer.add(DocId(3), 128);
        writer.add(DocId(4), 10000);
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.next(), Some((DocId(0), 0)));
        assert_eq!(reader.next(), Some((DocId(1), 1)));
        assert_eq!(reader.next(), Some((DocId(2), 127)));
        assert_eq!(reader.next(), Some((DocId(3), 128)));
        assert_eq!(reader.next(), Some((DocId(4), 10000)));
    }

    #[test]
    fn empty_posting_list() {
        let writer = PostingListWriter::new();
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.len(), 0);
        assert!(reader.is_empty());
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn large_posting_list() {
        let count = 10_000;
        let mut writer = PostingListWriter::new();
        for i in 0..count {
            writer.add(DocId(i * 3), (i % 10) + 1);
        }
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.len(), count);
        for i in 0..count {
            let (doc_id, tf) = reader.next().unwrap();
            assert_eq!(doc_id, DocId(i * 3));
            assert_eq!(tf, (i % 10) + 1);
        }
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn delta_encoding_compresses() {
        // Consecutive IDs should compress well (delta=1 each, 1 byte per delta).
        let mut writer = PostingListWriter::new();
        for i in 0..1000 {
            writer.add(DocId(i), 1);
        }
        let data = writer.finish();

        // 4 bytes header + ~1 byte per delta + ~1 byte per tf = ~2004 bytes.
        // Without delta encoding it would be ~5000+ bytes.
        assert!(data.len() < 3000);
    }

    #[test]
    fn wide_gap_doc_ids() {
        let mut writer = PostingListWriter::new();
        writer.add(DocId(0), 1);
        writer.add(DocId(1_000_000), 1);
        writer.add(DocId(2_000_000), 1);
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.next(), Some((DocId(0), 1)));
        assert_eq!(reader.next(), Some((DocId(1_000_000), 1)));
        assert_eq!(reader.next(), Some((DocId(2_000_000), 1)));
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn iterator_exhaustion_is_stable() {
        let mut writer = PostingListWriter::new();
        writer.add(DocId(1), 1);
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert!(reader.next().is_some());
        assert_eq!(reader.next(), None);
        assert_eq!(reader.next(), None); // calling again is safe
        assert_eq!(reader.next(), None);
    }

    #[test]
    fn doc_id_starting_at_zero() {
        let mut writer = PostingListWriter::new();
        writer.add(DocId(0), 5);
        let data = writer.finish();

        let mut reader = PostingListReader::new(&data);
        assert_eq!(reader.next(), Some((DocId(0), 5)));
    }

    // --- Position-aware posting list tests ---

    #[test]
    fn position_round_trip_single_doc() {
        let mut writer = PositionPostingListWriter::new();
        writer.add(DocId(0), &[0, 3, 7]);
        let data = writer.finish();

        assert!(has_positions(&data));
        let mut reader = PositionPostingListReader::new(&data);
        assert_eq!(reader.len(), 1);
        let (doc_id, positions) = reader.next().unwrap();
        assert_eq!(doc_id, DocId(0));
        assert_eq!(positions, vec![0, 3, 7]);
        assert!(reader.next().is_none());
    }

    /// Several documents with differently sized position lists are read back
    /// in write order with their positions intact.
    #[test]
    fn position_round_trip_multiple_docs() {
        // (doc id, positions) in the order they are written and must be read.
        let expected: [(u32, &[u32]); 3] = [(1, &[0, 1]), (5, &[2, 5, 8]), (10, &[0])];

        let mut w = PositionPostingListWriter::new();
        for &(id, positions) in expected.iter() {
            w.add(DocId(id), positions);
        }
        let encoded = w.finish();

        let mut postings = PositionPostingListReader::new(&encoded);
        assert_eq!(postings.len(), 3);

        for &(id, positions) in expected.iter() {
            let (doc, decoded) = postings.next().unwrap();
            assert_eq!(doc, DocId(id));
            assert_eq!(decoded, positions.to_vec());
        }

        // The reader is exhausted after the third document.
        assert!(postings.next().is_none());
    }

    /// Positions that are directly adjacent (0, 1, 2, 3, 4) round-trip.
    #[test]
    fn position_consecutive_positions() {
        let run = [0u32, 1, 2, 3, 4];

        let mut w = PositionPostingListWriter::new();
        w.add(DocId(0), &run);
        let encoded = w.finish();

        let mut postings = PositionPostingListReader::new(&encoded);
        let (_, decoded) = postings.next().unwrap();
        assert_eq!(decoded, run.to_vec());
    }

    /// Positions separated by large gaps (up to 5000) round-trip.
    #[test]
    fn position_gapped_positions() {
        let mut w = PositionPostingListWriter::new();
        w.add(DocId(0), &[0, 100, 200, 5000]);
        let encoded = w.finish();

        let mut postings = PositionPostingListReader::new(&encoded);
        // The doc ID is irrelevant for this test; keep only the positions.
        let decoded = postings.next().unwrap().1;
        assert_eq!(decoded, vec![0, 100, 200, 5000]);
    }

    /// A writer that never received a posting yields a list that reads as
    /// empty: zero length, `is_empty()`, and no entries.
    #[test]
    fn position_empty_list() {
        let untouched = PositionPostingListWriter::new();
        let encoded = untouched.finish();
        let mut postings = PositionPostingListReader::new(&encoded);

        assert_eq!(postings.len(), 0);
        assert!(postings.is_empty());
        assert!(postings.next().is_none());
    }

    /// A document with exactly one position round-trips.
    #[test]
    fn position_single_position() {
        let mut w = PositionPostingListWriter::new();
        w.add(DocId(42), &[7]);
        let encoded = w.finish();

        let mut postings = PositionPostingListReader::new(&encoded);
        let (doc, decoded) = postings.next().unwrap();

        assert_eq!(doc, DocId(42));
        assert_eq!(decoded, vec![7]);
    }

    /// 1000 consecutive documents, each with two positions derived from its
    /// doc ID, are all read back in order.
    #[test]
    fn position_many_docs() {
        const DOCS: u32 = 1000;

        let mut w = PositionPostingListWriter::new();
        for doc in 0..DOCS {
            let base = doc * 2;
            w.add(DocId(doc), &[base, base + 1]);
        }
        let encoded = w.finish();

        let mut postings = PositionPostingListReader::new(&encoded);
        assert_eq!(postings.len(), 1000);

        for doc in 0..DOCS {
            let base = doc * 2;
            let (decoded_doc, decoded_positions) = postings.next().unwrap();
            assert_eq!(decoded_doc, DocId(doc));
            assert_eq!(decoded_positions, vec![base, base + 1]);
        }

        assert!(postings.next().is_none());
    }

    /// `has_positions` is true for output of the position-aware writer and
    /// false for output of the plain writer.
    #[test]
    fn has_positions_flag() {
        // Output of the position-aware writer is flagged.
        let mut positional = PositionPostingListWriter::new();
        positional.add(DocId(0), &[0]);
        let flagged = positional.finish();
        assert!(has_positions(&flagged));

        // Output of the original writer is not.
        let mut plain = PostingListWriter::new();
        plain.add(DocId(0), 1);
        let unflagged = plain.finish();
        assert!(!has_positions(&unflagged));
    }

    /// After the only entry is consumed, the position reader keeps
    /// returning `None` on repeated calls.
    #[test]
    fn position_exhaustion_stable() {
        let mut w = PositionPostingListWriter::new();
        w.add(DocId(0), &[0]);
        let encoded = w.finish();
        let mut postings = PositionPostingListReader::new(&encoded);

        assert!(postings.next().is_some());

        // Two further polls, both must report exhaustion.
        for _ in 0..2 {
            assert!(postings.next().is_none());
        }
    }

    // --- Block-max posting list tests ---

    /// A single posting produces one block whose metadata (last doc, max TF)
    /// mirrors that posting, and the encoded bytes carry the block-max flag.
    #[test]
    fn block_max_single_doc() {
        let mut w = BlockMaxPostingListWriter::new();
        w.add(DocId(5), 3);
        let encoded = w.finish();
        assert!(has_block_max(&encoded));

        let mut postings = BlockMaxPostingListReader::new(&encoded);

        // Header and block metadata.
        assert_eq!(postings.len(), 1);
        assert_eq!(postings.num_blocks(), 1);
        assert_eq!(postings.block_last_doc(0), 5);
        assert_eq!(postings.block_max_tf(0), 3);

        // Posting data.
        assert_eq!(postings.next(), Some((DocId(5), 3)));
        assert_eq!(postings.next(), None);
    }

    /// 50 postings stay within a single block; its metadata reports the last
    /// doc ID written and the largest TF seen.
    #[test]
    fn block_max_under_block_size() {
        // TF cycles through 1..=5 as the index grows.
        let tf_of = |i: u32| (i % 5) + 1;

        let mut w = BlockMaxPostingListWriter::new();
        for i in 0..50u32 {
            w.add(DocId(i * 2), tf_of(i));
        }
        let encoded = w.finish();

        let mut postings = BlockMaxPostingListReader::new(&encoded);
        assert_eq!(postings.len(), 50);
        assert_eq!(postings.num_blocks(), 1);
        assert_eq!(postings.block_last_doc(0), 98);
        assert_eq!(postings.block_max_tf(0), 5); // largest value tf_of produces

        for i in 0..50u32 {
            assert_eq!(postings.next(), Some((DocId(i * 2), tf_of(i))));
        }
        assert_eq!(postings.next(), None);
    }

    /// Exactly 128 postings are stored as one block, with no second block.
    #[test]
    fn block_max_exact_block_size() {
        const DOCS: u32 = 128;

        let mut w = BlockMaxPostingListWriter::new();
        for id in 0..DOCS {
            w.add(DocId(id), 1);
        }
        let encoded = w.finish();

        let mut postings = BlockMaxPostingListReader::new(&encoded);
        assert_eq!(postings.len(), 128);
        assert_eq!(postings.num_blocks(), 1);
        assert_eq!(postings.block_last_doc(0), 127);

        for id in 0..DOCS {
            assert_eq!(postings.next(), Some((DocId(id), 1)));
        }
        assert_eq!(postings.next(), None);
    }

    /// 300 postings span three blocks: two full blocks of 128 followed by a
    /// partial block of 44. Block boundaries and full iteration are checked.
    #[test]
    fn block_max_multi_block() {
        const DOCS: u32 = 300;
        // TF cycles through 1..=10 as the doc ID grows.
        let tf_of = |id: u32| (id % 10) + 1;

        let mut w = BlockMaxPostingListWriter::new();
        for id in 0..DOCS {
            w.add(DocId(id), tf_of(id));
        }
        let encoded = w.finish();

        let mut postings = BlockMaxPostingListReader::new(&encoded);
        assert_eq!(postings.len(), 300);
        assert_eq!(postings.num_blocks(), 3);

        // First block covers docs 0-127 and has seen every TF in 1..=10.
        assert_eq!(postings.block_last_doc(0), 127);
        assert_eq!(postings.block_max_tf(0), 10);

        // Second block covers docs 128-255.
        assert_eq!(postings.block_last_doc(1), 255);

        // Third (partial) block covers docs 256-299.
        assert_eq!(postings.block_last_doc(2), 299);

        // Iterating yields every posting across all blocks, then stops.
        for id in 0..DOCS {
            assert_eq!(postings.next(), Some((DocId(id), tf_of(id))));
        }
        assert_eq!(postings.next(), None);
    }

    /// `advance_to_block` moves the reader to a block whose last doc is at or
    /// beyond the target; the next posting read is that block's first doc.
    #[test]
    fn block_max_advance_to_block() {
        let mut w = BlockMaxPostingListWriter::new();
        for i in 0..300u32 {
            w.add(DocId(i * 3), 1);
        }
        let encoded = w.finish();
        let mut postings = BlockMaxPostingListReader::new(&encoded);

        // Doc IDs are multiples of 3, so the blocks cover:
        //   block 0: 0   ..= 381  (indices 0-127)
        //   block 1: 384 ..= 765  (indices 128-255)
        //   block 2: 768 ..= 897  (indices 256-299)

        // Target 400 lies in block 1 (last doc 765 >= 400).
        postings.advance_to_block(DocId(400));
        let landed = postings.next().unwrap().0;
        assert_eq!(landed, DocId(384)); // first doc of block 1

        // Target 800 lies in block 2.
        postings.advance_to_block(DocId(800));
        let landed = postings.next().unwrap().0;
        assert_eq!(landed, DocId(768)); // first doc of block 2
    }

    /// Advancing to a doc ID beyond every stored posting leaves the reader
    /// with nothing to return.
    #[test]
    fn block_max_advance_past_end() {
        let mut w = BlockMaxPostingListWriter::new();
        for id in 0..10 {
            w.add(DocId(id), 1);
        }
        let encoded = w.finish();

        let mut postings = BlockMaxPostingListReader::new(&encoded);
        // The largest stored doc ID is 9, so 100 is past the end.
        postings.advance_to_block(DocId(100));
        assert_eq!(postings.next(), None);
    }

    /// A block-max writer that never received a posting yields a list with
    /// zero length, zero blocks, and no entries.
    #[test]
    fn block_max_empty() {
        let untouched = BlockMaxPostingListWriter::new();
        let encoded = untouched.finish();
        let mut postings = BlockMaxPostingListReader::new(&encoded);

        assert_eq!(postings.len(), 0);
        assert!(postings.is_empty());
        assert_eq!(postings.num_blocks(), 0);
        assert_eq!(postings.next(), None);
    }

    /// A term frequency above `u16::MAX` is clamped in the block metadata
    /// while the posting itself keeps the exact value.
    #[test]
    fn block_max_large_tf_clamped() {
        // Larger than u16::MAX (65535).
        let oversized: u32 = 70000;

        let mut w = BlockMaxPostingListWriter::new();
        w.add(DocId(0), oversized);
        let encoded = w.finish();

        let mut postings = BlockMaxPostingListReader::new(&encoded);
        // Block metadata saturates at u16::MAX ...
        assert_eq!(postings.block_max_tf(0), u16::MAX);
        // ... but the stored posting still carries the real TF.
        assert_eq!(postings.next(), Some((DocId(0), oversized)));
    }

    /// Block-max output is detected by `has_block_max` and not mistaken for
    /// the position format; plain output is not detected as block-max.
    #[test]
    fn block_max_flag_detected() {
        // Block-max format: block-max flag set, positions flag clear.
        let mut block_writer = BlockMaxPostingListWriter::new();
        block_writer.add(DocId(0), 1);
        let block_bytes = block_writer.finish();
        assert!(has_block_max(&block_bytes));
        assert!(!has_positions(&block_bytes));

        // Plain format: block-max flag clear.
        let mut plain_writer = PostingListWriter::new();
        plain_writer.add(DocId(0), 1);
        let plain_bytes = plain_writer.finish();
        assert!(!has_block_max(&plain_bytes));
    }
}