lucisearch 0.8.0

//! Columnar store reader: access column values by doc_id.
//!
//! See [[columnar-storage]] and [[feature-aggregations-v010#Step 1]].

#[cfg(test)]
use std::cell::Cell;
use std::cell::OnceCell;

use super::writer::{ColumnType, DICT_BLOCK_MASK, DICT_BLOCK_SHIFT, DICT_BLOCK_SIZE, unpack_i64};
use crate::core::FieldId;

/// Per-column statistics for numeric columns (zonemaps).
///
/// `ColumnReader::open` decodes these from the 20-byte little-endian stats
/// header that follows the null bitset: `null_count: u32`, then `min: f64`,
/// then `max: f64`.
#[derive(Clone, Debug)]
pub struct ColumnStats {
    /// Null count as stored in the stats header.
    pub null_count: u32,
    /// Minimum value as stored in the stats header.
    pub min: f64,
    /// Maximum value as stored in the stats header.
    pub max: f64,
}

/// Dispatched keyword-dictionary representation, replacing the eager
/// `dict: Vec<&'a str>` so a `KeywordBlocked` column can resolve `ordinal →
/// string` by random block seek without materializing the whole dictionary.
/// See [[optimization-keyword-dict-offset-index]].
///
/// In both layouts a dictionary entry is a little-endian `u16` byte length
/// followed by that many UTF-8 bytes.
enum KeywordDict<'a> {
    /// Non-keyword (or empty) column — no dictionary.
    None,
    /// Legacy `Keyword = 1` column: eagerly parsed at open (back-compat).
    Eager(Vec<&'a str>),
    /// `KeywordBlocked = 8` column: random-access via the block-address array.
    /// `dict` is the lazily-materialized eager `Vec`, built by `ensure_dict()`
    /// only when a bulk consumer (sort/collapse/terms-agg) opts in. Random
    /// consumers (`hit.id`, `fields=`) leave it empty and block-seek. `OnceCell`
    /// is `!Sync` but `Send` (its `T = Vec<&str>` is `Send`), which suffices:
    /// every bulk reader is thread-local under `par_iter`, never shared.
    Blocked {
        /// Byte offset (into the column's `data`) of the first dictionary entry.
        dict_body_start: usize,
        /// Byte offset of the block-address array: one little-endian `u64` per
        /// block of `DICT_BLOCK_SIZE` entries, each giving the offset of that
        /// block's first entry relative to `dict_body_start`.
        block_addrs_start: usize,
        /// Number of entries in the dictionary, read from the column header.
        num_entries: u32,
        /// Lazily built `ordinal → string` table; unset until `ensure_dict()`.
        dict: OnceCell<Vec<&'a str>>,
    },
}

/// Reads a single column's values from serialized bytes.
///
/// All `*_start` fields are byte offsets into `data`.
pub struct ColumnReader<'a> {
    /// Serialized bytes of this one column, beginning at its 7-byte header.
    data: &'a [u8],
    /// Field id decoded from the header.
    field_id: FieldId,
    /// Column type decoded from the header's type byte.
    col_type: ColumnType,
    /// Number of docs in the column; 0 for empty or too-short input.
    doc_count: u32,
    /// Start of the null bitset: one bit per doc, a set bit means "null".
    null_bitset_start: usize,
    /// Start of the type-specific body (after the null bitset and, for numeric
    /// columns, after the stats header).
    body_start: usize,
    /// For keyword columns: the dictionary (eager, or block-addressed + lazy).
    keyword: KeywordDict<'a>,
    /// For keyword columns: start of ordinal array
    /// (one little-endian `u32` per doc; stays 0 for non-keyword columns).
    ordinals_start: usize,
    /// For numeric columns: precomputed stats (zonemaps)
    stats: Option<ColumnStats>,
}

// Lock in the auto-trait invariant: the `OnceCell` in `KeywordDict::Blocked` is
// `Send` (its `T` is) but drops `Sync`, so `ColumnReader`/`OwnedColumn` stay
// Send-but-not-Sync. `!Sync` is sound — every bulk reader is stack-local within a
// per-segment `par_iter` closure and finished there; only string-keyed results
// (sort keys, finished agg buckets) cross threads, never the reader. `Send` is
// required transitively by the `Aggregator: Send` supertrait (`agg/mod.rs`);
// assert it here as a tripwire so a future change making the reader `!Send` fails
// *here*, loudly, instead of being papered over by one of the `unsafe impl Send`
// collectors (`bucket.rs`, `hll.rs`) and miscompiling. `Sync` is intentionally
// NOT asserted. See [[optimization-keyword-dict-offset-index]].
//
// The closure is never called: the `T: Send` bound is enforced when its body is
// type-checked, so this is a compile-time-only check.
const _: fn() = || {
    fn assert_send<T: Send>() {}
    assert_send::<ColumnReader<'static>>();
};

#[cfg(test)]
thread_local! {
    /// Test-only counter of real lazy-dictionary **builds** (not calls). Lives
    /// at module scope so `ensure_dict()` can bump it from inside its
    /// `get_or_init` closure; `mod tests` reads it via `super::DICT_BUILDS`.
    /// Distinct from `owned.rs`'s `COLUMN_OPENS` (which counts
    /// `OwnedColumn::new`). See Test 10 in
    /// [[optimization-keyword-dict-offset-index]].
    ///
    /// Being thread-local, each thread has its own counter; a test should
    /// reset it to 0 before the section it wants to measure.
    static DICT_BUILDS: Cell<u32> = const { Cell::new(0) };
}

impl<'a> ColumnReader<'a> {
    /// Parse one serialized column and return a reader over it.
    ///
    /// Layout read here, all little-endian: a 7-byte header (`field_id: u16`,
    /// type byte, `doc_count: u32`), a null bitset of `ceil(doc_count / 8)`
    /// bytes, then — for numeric types, when at least 20 more bytes are
    /// present — a stats header (`null_count: u32`, `min: f64`, `max: f64`),
    /// then the type-specific body.
    ///
    /// Input shorter than 7 bytes, an unknown type byte, an `Empty` column or a
    /// zero-doc column all produce a reader with `doc_count == 0` and no
    /// dictionary.
    ///
    /// # Panics
    ///
    /// Panics if a keyword column's dictionary header is truncated, or — for
    /// the legacy eager layout — if any entry is truncated or not valid UTF-8.
    pub fn open(data: &'a [u8]) -> Self {
        if data.len() < 7 {
            return Self::empty(data);
        }

        let field_id = FieldId::new(u16::from_le_bytes([data[0], data[1]]));
        let col_type_byte = data[2];
        let doc_count = u32::from_le_bytes([data[3], data[4], data[5], data[6]]);

        // Unknown type bytes are treated as `Empty` rather than rejected.
        let col_type = match col_type_byte {
            0 => ColumnType::Empty,
            1 => ColumnType::Keyword,
            2 => ColumnType::I64,
            3 => ColumnType::F64,
            4 => ColumnType::Bool,
            5 => ColumnType::ConstantI64,
            6 => ColumnType::ConstantF64,
            7 => ColumnType::BitpackedI64,
            8 => ColumnType::KeywordBlocked,
            _ => ColumnType::Empty,
        };

        if doc_count == 0 || col_type == ColumnType::Empty {
            return Self {
                data,
                field_id,
                col_type,
                doc_count: 0,
                null_bitset_start: 7,
                body_start: 7,
                keyword: KeywordDict::None,
                ordinals_start: 0,
                stats: None,
            };
        }

        let null_bytes = (doc_count as usize).div_ceil(8);
        let null_bitset_start = 7;
        let mut body_start = null_bitset_start + null_bytes;

        // Parse numeric stats if present
        let stats = if col_type.is_numeric() && body_start + 20 <= data.len() {
            let null_count =
                u32::from_le_bytes(data[body_start..body_start + 4].try_into().unwrap());
            let min = f64::from_le_bytes(data[body_start + 4..body_start + 12].try_into().unwrap());
            let max =
                f64::from_le_bytes(data[body_start + 12..body_start + 20].try_into().unwrap());
            body_start += 20; // skip past stats
            Some(ColumnStats {
                null_count,
                min,
                max,
            })
        } else {
            None
        };

        let mut reader = Self {
            data,
            field_id,
            col_type,
            doc_count,
            null_bitset_start,
            body_start,
            keyword: KeywordDict::None,
            ordinals_start: 0,
            stats,
        };

        match col_type {
            // Legacy: eager parse (back-compat) — sets KeywordDict::Eager.
            ColumnType::Keyword => reader.parse_keyword_dict(),
            // New: record offsets, NO full parse. The dictionary body is walked
            // lazily by ensure_dict(); random lookups block-seek directly.
            ColumnType::KeywordBlocked => {
                let mut pos = reader.body_start;
                let num_entries = u32::from_le_bytes(reader.data[pos..pos + 4].try_into().unwrap());
                pos += 4;
                let body_len_raw =
                    u64::from_le_bytes(reader.data[pos..pos + 8].try_into().unwrap());
                pos += 8;
                // Check the stored u64 BEFORE narrowing it: asserting on the
                // already-cast `usize` can never fail. Unreachable on supported
                // (64-bit usize) targets; a >4 GiB body cannot be mmap'd into a
                // &[u8] on 32-bit anyway.
                debug_assert!(
                    usize::try_from(body_len_raw).is_ok(),
                    "keyword dict body_len does not fit in usize"
                );
                let body_len = body_len_raw as usize;
                let dict_body_start = pos;
                let block_addrs_start = dict_body_start + body_len;
                let num_blocks = (num_entries as usize).div_ceil(DICT_BLOCK_SIZE);
                reader.ordinals_start = block_addrs_start + num_blocks * 8;
                reader.keyword = KeywordDict::Blocked {
                    dict_body_start,
                    block_addrs_start,
                    num_entries,
                    dict: OnceCell::new(),
                };
            }
            _ => {}
        }

        reader
    }

    /// Placeholder reader for input too short to hold a column header: type
    /// `Empty`, field id 0, zero docs, no dictionary and no stats.
    fn empty(data: &'a [u8]) -> Self {
        Self {
            col_type: ColumnType::Empty,
            field_id: FieldId::new(0),
            doc_count: 0,
            stats: None,
            keyword: KeywordDict::None,
            null_bitset_start: 0,
            body_start: 0,
            ordinals_start: 0,
            data,
        }
    }

    /// Eagerly decode a legacy `Keyword` dictionary starting at `body_start`.
    ///
    /// Reads the `u32` entry count, then each `u16`-length-prefixed entry, and
    /// records where the ordinal array begins (right after the last entry).
    /// Panics on truncated data or invalid UTF-8.
    fn parse_keyword_dict(&mut self) {
        let bytes = self.data;
        let count_at = self.body_start;
        let entry_count =
            u32::from_le_bytes(bytes[count_at..count_at + 4].try_into().unwrap()) as usize;

        let mut cursor = count_at + 4;
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let text_len = u16::from_le_bytes(bytes[cursor..cursor + 2].try_into().unwrap()) as usize;
            let text_at = cursor + 2;
            cursor = text_at + text_len;
            entries.push(std::str::from_utf8(&bytes[text_at..cursor]).unwrap());
        }

        self.keyword = KeywordDict::Eager(entries);
        self.ordinals_start = cursor;
    }

    /// Field id decoded from the column header (0 for too-short input).
    pub fn field_id(&self) -> FieldId {
        self.field_id
    }

    /// Column type decoded from the header; unknown type bytes and too-short
    /// input are reported as `ColumnType::Empty`.
    pub(crate) fn col_type(&self) -> ColumnType {
        self.col_type
    }

    /// Number of docs this reader exposes; 0 for empty or too-short columns.
    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    /// Whether `doc_id` has no value in this column.
    ///
    /// Doc ids at or beyond `doc_count` are reported as null; otherwise the
    /// answer is the doc's bit in the null bitset (set bit = null).
    pub fn is_null(&self, doc_id: u32) -> bool {
        if doc_id >= self.doc_count {
            return true;
        }
        let idx = doc_id as usize;
        let byte = self.data[self.null_bitset_start + (idx >> 3)];
        (byte & (1u8 << (idx & 7))) != 0
    }

    /// Get the raw dictionary ordinal for a keyword field (no string decode).
    pub fn keyword_ordinal(&self, doc_id: u32) -> Option<u32> {
        if doc_id >= self.doc_count || self.is_null(doc_id) {
            return None;
        }
        let pos = self.ordinals_start + doc_id as usize * 4;
        let ordinal = u32::from_le_bytes(self.data[pos..pos + 4].try_into().unwrap());
        if ordinal == u32::MAX {
            None
        } else {
            Some(ordinal)
        }
    }

    /// Number of entries in the keyword dictionary (0 when there is none).
    pub fn dict_size(&self) -> usize {
        match &self.keyword {
            // Use the header's num_entries, NOT the lazy OnceCell: dict_size()
            // is called at collector construction (bucket.rs) BEFORE any
            // ensure_dict(), so the unbuilt OnceCell would report 0 and
            // silently degrade the terms-agg Vec::with_capacity pre-size.
            KeywordDict::Blocked { num_entries, .. } => *num_entries as usize,
            KeywordDict::Eager(entries) => entries.len(),
            KeywordDict::None => 0,
        }
    }

    /// Map a dictionary ordinal to its string.
    ///
    /// `None` always means "no such ordinal" (no dictionary, or the ordinal is
    /// out of range). Invalid UTF-8 — possible only with corrupt data — panics
    /// instead, so corruption is never reported as a missing ordinal
    /// ([[code-must-not-lie]]).
    pub fn ordinal_to_string(&self, ordinal: u32) -> Option<&'a str> {
        let (body_at, addrs_at, cache) = match &self.keyword {
            KeywordDict::None => return None,
            KeywordDict::Eager(entries) => return entries.get(ordinal as usize).copied(),
            KeywordDict::Blocked {
                dict_body_start,
                block_addrs_start,
                num_entries,
                dict,
            } => {
                if ordinal >= *num_entries {
                    return None;
                }
                (*dict_body_start, *block_addrs_start, dict)
            }
        };

        // BULK fast path: once ensure_dict() has materialized the table, every
        // lookup is a plain index. Random consumers never populate it.
        if let Some(entries) = cache.get() {
            return entries.get(ordinal as usize).copied();
        }

        // RANDOM path: jump to the ordinal's block via the block-address
        // array, then step over at most DICT_BLOCK_MASK entries. No Vec built.
        let bytes = self.data;
        let entry_len =
            |at: usize| u16::from_le_bytes(bytes[at..at + 2].try_into().unwrap()) as usize;

        let slot = addrs_at + (ordinal >> DICT_BLOCK_SHIFT) as usize * 8;
        let block_offset = u64::from_le_bytes(bytes[slot..slot + 8].try_into().unwrap()) as usize;
        let mut cursor = body_at + block_offset;
        for _ in 0..(ordinal & DICT_BLOCK_MASK) {
            cursor += 2 + entry_len(cursor);
        }

        let text_len = entry_len(cursor);
        let text_at = cursor + 2;
        let text = std::str::from_utf8(&bytes[text_at..text_at + text_len])
            .expect("keyword dict bytes are valid UTF-8 by construction");
        Some(text)
    }

    /// Bulk opt-in: build the full `ordinal → string` table once and cache it
    /// in the `OnceCell`, so later `ordinal_to_string`/`keyword_value` calls
    /// index it directly instead of block-seeking.
    ///
    /// Does nothing for `Eager`/`None` dictionaries. For `Blocked` it is
    /// idempotent: the `OnceCell` runs the O(N) body walk at most once. Bulk
    /// consumers (sort/collapse/terms-agg `finish`) call this before their
    /// resolve loop. Panics if an entry is not valid UTF-8.
    pub fn ensure_dict(&self) {
        let KeywordDict::Blocked {
            dict_body_start,
            num_entries,
            dict,
            ..
        } = &self.keyword
        else {
            return;
        };

        let bytes = self.data;
        let first_entry = *dict_body_start;
        let entry_count = *num_entries as usize;
        dict.get_or_init(move || {
            #[cfg(test)]
            DICT_BUILDS.with(|c| c.set(c.get() + 1));
            let mut entries = Vec::with_capacity(entry_count);
            let mut cursor = first_entry;
            for _ in 0..entry_count {
                let text_len =
                    u16::from_le_bytes(bytes[cursor..cursor + 2].try_into().unwrap()) as usize;
                let text_at = cursor + 2;
                entries.push(
                    std::str::from_utf8(&bytes[text_at..text_at + text_len])
                        .expect("keyword dict bytes are valid UTF-8 by construction"),
                );
                cursor = text_at + text_len;
            }
            entries
        });
    }

    /// Resolve a doc's keyword string: look up its ordinal, then decode it.
    ///
    /// Decoding goes through the bounds-checked `ordinal_to_string`, so an
    /// out-of-range ordinal (corrupt data only) yields `None` rather than a
    /// panic; for writer-emitted data the result is the stored value.
    pub fn keyword_value(&self, doc_id: u32) -> Option<&'a str> {
        let ordinal = self.keyword_ordinal(doc_id)?;
        self.ordinal_to_string(ordinal)
    }

    /// Read a doc's value from an integer-encoded body.
    ///
    /// Returns `None` for out-of-range or null docs. A `BitpackedI64` body is
    /// `min: i64` + `bit_width: u8` + packed residuals, decoded via
    /// `unpack_i64`; any other type is read as one little-endian `i64` per doc.
    /// The column type is not otherwise checked here.
    pub fn i64_value(&self, doc_id: u32) -> Option<i64> {
        if doc_id >= self.doc_count || self.is_null(doc_id) {
            return None;
        }
        let base = self.body_start;
        let idx = doc_id as usize;
        let value = if self.col_type == ColumnType::BitpackedI64 {
            let min_val = i64::from_le_bytes(self.data[base..base + 8].try_into().unwrap());
            let bit_width = self.data[base + 8];
            unpack_i64(&self.data[base + 9..], idx, min_val, bit_width)
        } else {
            let at = base + idx * 8;
            i64::from_le_bytes(self.data[at..at + 8].try_into().unwrap())
        };
        Some(value)
    }

    /// Read a doc's value as one little-endian `f64` from the body.
    ///
    /// Returns `None` for out-of-range or null docs. The column type is not
    /// checked here.
    pub fn f64_value(&self, doc_id: u32) -> Option<f64> {
        if doc_id >= self.doc_count || self.is_null(doc_id) {
            return None;
        }
        let at = self.body_start + doc_id as usize * 8;
        let raw: [u8; 8] = self.data[at..at + 8].try_into().unwrap();
        Some(f64::from_le_bytes(raw))
    }

    /// Read a doc's value from a bit-per-doc body (set bit = `true`).
    ///
    /// Returns `None` for out-of-range or null docs. The column type is not
    /// checked here.
    pub fn bool_value(&self, doc_id: u32) -> Option<bool> {
        if doc_id >= self.doc_count || self.is_null(doc_id) {
            return None;
        }
        let idx = doc_id as usize;
        let byte = self.data[self.body_start + (idx >> 3)];
        Some((byte & (1u8 << (idx & 7))) != 0)
    }

    /// Read a doc's numeric value widened to `f64`.
    ///
    /// `I64` and `BitpackedI64` values are converted with `as f64`; `F64`
    /// values are returned as stored. Every other column type — including the
    /// constant encodings — yields `None`, as do null docs. For constant
    /// columns use `is_constant()` + `constant_value()` instead.
    pub fn numeric_value(&self, doc_id: u32) -> Option<f64> {
        let ty = self.col_type;
        if ty == ColumnType::F64 {
            return self.f64_value(doc_id);
        }
        if matches!(ty, ColumnType::I64 | ColumnType::BitpackedI64) {
            return self.i64_value(doc_id).map(|v| v as f64);
        }
        None
    }

    /// Whether the column uses a constant encoding (`ConstantI64` or
    /// `ConstantF64`), i.e. all non-null values are identical. Enables O(1)
    /// aggregation fast-paths.
    pub fn is_constant(&self) -> bool {
        let ty = self.col_type;
        ty == ColumnType::ConstantI64 || ty == ColumnType::ConstantF64
    }

    /// The single stored value of a constant-encoded column, as `f64`.
    ///
    /// Reads the 8 little-endian bytes at `body_start` as `i64` (then widened)
    /// for `ConstantI64`, or as `f64` for `ConstantF64`. Any other column type
    /// returns `None` without touching the body.
    pub fn constant_value(&self) -> Option<f64> {
        let stored_as_int = match self.col_type {
            ColumnType::ConstantI64 => true,
            ColumnType::ConstantF64 => false,
            _ => return None,
        };
        let raw: [u8; 8] = self.data[self.body_start..self.body_start + 8]
            .try_into()
            .unwrap();
        Some(if stored_as_int {
            i64::from_le_bytes(raw) as f64
        } else {
            f64::from_le_bytes(raw)
        })
    }

    /// Get precomputed column statistics (min, max, null_count) for numeric
    /// columns. Returns None for non-numeric types, for empty or zero-doc
    /// columns, and when the data was too short to hold the 20-byte stats
    /// header at open time. Enables O(1) aggregation pushdown for
    /// min/max/count on unfiltered queries.
    pub fn stats(&self) -> Option<&ColumnStats> {
        self.stats.as_ref()
    }
}

/// Reads the columnar component of a segment — multiple columns.
pub struct ColumnarReader<'a> {
    /// Serialized bytes of the whole columnar component.
    data: &'a [u8],
    /// One `(field_id, start_offset, end_offset)` per indexed column; the
    /// offsets are byte positions into `data`, with `end_offset` exclusive.
    columns: Vec<(FieldId, usize, usize)>,
}

impl<'a> ColumnarReader<'a> {
    /// Index every column of a serialized columnar component.
    ///
    /// The component starts with a little-endian `u16` column count, followed
    /// by the columns back to back. Each column's extent is found by decoding
    /// just enough of it to step over its header, null bitset, numeric stats
    /// header and type-specific body. Scanning stops early — keeping the
    /// columns indexed so far — when fewer than 7 header bytes remain.
    ///
    /// # Panics
    ///
    /// Panics if a body field needed to compute an extent lies past the end of
    /// `data`.
    pub fn open(data: &'a [u8]) -> Self {
        if data.len() < 2 {
            return Self {
                data,
                columns: Vec::new(),
            };
        }

        let declared = u16::from_le_bytes([data[0], data[1]]) as usize;
        let mut columns = Vec::with_capacity(declared);
        let mut cursor = 2usize;

        for _ in 0..declared {
            let column_start = cursor;
            // Not enough bytes left for a 7-byte column header: stop scanning.
            if data.len() < cursor + 7 {
                break;
            }
            let field_id = FieldId::new(u16::from_le_bytes([data[cursor], data[cursor + 1]]));
            let type_byte = data[cursor + 2];
            let docs =
                u32::from_le_bytes(data[cursor + 3..cursor + 7].try_into().unwrap()) as usize;
            cursor += 7;

            // Empty (type 0) and zero-doc columns are header-only.
            if docs != 0 && type_byte != 0 {
                // Null bitset: one bit per doc.
                cursor += (docs + 7) / 8;

                // Numeric types carry a 20-byte stats header
                // (null_count + min + max).
                if matches!(type_byte, 2 | 3 | 5 | 6 | 7) {
                    cursor += 20;
                }

                match type_byte {
                    // Keyword: walk the length-prefixed dictionary, then ordinals.
                    1 => {
                        let entries =
                            u32::from_le_bytes(data[cursor..cursor + 4].try_into().unwrap())
                                as usize;
                        cursor += 4;
                        for _ in 0..entries {
                            let text_len =
                                u16::from_le_bytes(data[cursor..cursor + 2].try_into().unwrap())
                                    as usize;
                            cursor += 2 + text_len;
                        }
                        cursor += docs * 4;
                    }
                    // KeywordBlocked: O(1) skip over dict_count, body_len, the
                    // dictionary body, the block-address array and ordinals.
                    8 => {
                        let entries =
                            u32::from_le_bytes(data[cursor..cursor + 4].try_into().unwrap())
                                as usize;
                        cursor += 4;
                        let dict_bytes =
                            u64::from_le_bytes(data[cursor..cursor + 8].try_into().unwrap())
                                as usize;
                        cursor += 8;
                        cursor += dict_bytes;
                        cursor += entries.div_ceil(DICT_BLOCK_SIZE) * 8;
                        cursor += docs * 4;
                    }
                    // i64 / f64: 8 bytes per doc.
                    2 | 3 => cursor += docs * 8,
                    // Bool: one bit per doc.
                    4 => cursor += (docs + 7) / 8,
                    // ConstantI64 / ConstantF64: a single 8-byte value.
                    5 | 6 => cursor += 8,
                    // BitpackedI64: min(8) + bit_width(1) + packed residuals.
                    7 => {
                        let bit_width = data[cursor + 8] as usize;
                        cursor += 9;
                        cursor += (docs * bit_width + 7) / 8;
                    }
                    _ => {}
                }
            }

            columns.push((field_id, column_start, cursor));
        }

        Self { data, columns }
    }

    /// Open a reader over the first indexed column whose field id equals
    /// `field_id`, or return `None` if no such column was indexed.
    pub fn column(&self, field_id: FieldId) -> Option<ColumnReader<'a>> {
        self.columns
            .iter()
            .find(|entry| entry.0 == field_id)
            .map(|&(_, start, end)| ColumnReader::open(&self.data[start..end]))
    }
}

#[cfg(test)]
mod tests {
    use super::{ColumnReader, ColumnarReader, DICT_BUILDS, KeywordDict};
    use crate::columnar::writer::{
        ColumnType, ColumnValue, ColumnWriter, ColumnarWriter, DICT_BLOCK_SIZE,
    };
    use crate::core::FieldId;
    use std::collections::HashMap;

    /// Build the bytes of a pre-v3 `Keyword = 1` column (no offset index) so
    /// the `Eager` back-compat read path can be exercised. No production code
    /// emits this anymore, so this helper is the only fixture for the legacy
    /// path. Layout:
    /// `[field_id][type=1][doc_count][null_bitset][dict_count][(len:u16)(bytes)×N][ordinals:u32×doc_count]`.
    fn write_legacy_keyword_column(field_id: u16, values: &[&str]) -> Vec<u8> {
        // Sorted, deduplicated dictionary: an entry's index is its ordinal.
        let mut dict: Vec<&str> = values.to_vec();
        dict.sort();
        dict.dedup();
        let ord_of: HashMap<&str, u32> = dict
            .iter()
            .enumerate()
            .map(|(i, term)| (*term, i as u32))
            .collect();

        let mut buf = Vec::new();
        // Header: field id, type byte (ColumnType::Keyword), doc count.
        buf.extend_from_slice(&field_id.to_le_bytes());
        buf.push(1u8);
        buf.extend_from_slice(&(values.len() as u32).to_le_bytes());
        // Null bitset, all zero: no nulls.
        buf.resize(buf.len() + values.len().div_ceil(8), 0u8);
        // Dictionary: count, then length-prefixed entries.
        buf.extend_from_slice(&(dict.len() as u32).to_le_bytes());
        for term in &dict {
            buf.extend_from_slice(&(term.len() as u16).to_le_bytes());
            buf.extend_from_slice(term.as_bytes());
        }
        // One ordinal per doc, in doc order.
        for value in values {
            buf.extend_from_slice(&ord_of[value].to_le_bytes());
        }
        buf
    }

    /// Test 1: a dictionary spanning more than three blocks reads back every
    /// ordinal through the block seek + intra-block walk, covering every inner
    /// offset and every block boundary.
    #[test]
    fn keyword_offset_index_roundtrip() {
        // Zero-padded keys sort numerically, so ordinal i is expected[i].
        let expected: Vec<String> = (0..200).map(|i| format!("key_{i:04}")).collect();
        let mut writer = ColumnWriter::new(FieldId::new(0));
        for key in &expected {
            writer.add(ColumnValue::Keyword(key.clone()));
        }
        let bytes = writer.finish();
        let reader = ColumnReader::open(&bytes);

        assert_eq!(reader.dict_size(), 200);
        for (i, want) in expected.iter().enumerate() {
            let got = reader.ordinal_to_string(i as u32);
            assert_eq!(got, Some(want.as_str()), "ordinal {i} mismatch");
        }
        // One past the last ordinal does not exist.
        assert_eq!(reader.ordinal_to_string(200), None);
    }

    /// Test 2: `keyword_value` returns the source value for each of 1000 docs,
    /// including the interleaved nulls.
    #[test]
    fn keyword_value_matches_doc() {
        // Docs with `doc % 7 == 3` are null; the rest cycle through 80 keys.
        let expected: Vec<Option<String>> = (0..1000u32)
            .map(|doc| {
                if doc % 7 == 3 {
                    None
                } else {
                    Some(format!(
                        "val_{:03}",
                        (doc.wrapping_mul(31).wrapping_add(17)) % 80
                    ))
                }
            })
            .collect();

        let mut writer = ColumnWriter::new(FieldId::new(0));
        for slot in &expected {
            match slot {
                Some(v) => writer.add(ColumnValue::Keyword(v.clone())),
                None => writer.add(ColumnValue::Null),
            };
        }
        let bytes = writer.finish();
        let reader = ColumnReader::open(&bytes);

        for (doc, want) in expected.iter().enumerate() {
            let got = reader.keyword_value(doc as u32);
            assert_eq!(got, want.as_deref(), "doc {doc} mismatch");
        }
    }

    /// Test 4: a dictionary with exactly one entry (a single block of size 1).
    #[test]
    fn keyword_single_entry() {
        let mut writer = ColumnWriter::new(FieldId::new(0));
        writer.add(ColumnValue::Keyword("only".into()));
        let bytes = writer.finish();
        let reader = ColumnReader::open(&bytes);

        assert_eq!(reader.dict_size(), 1);
        assert_eq!(reader.keyword_value(0), Some("only"));
        assert_eq!(reader.ordinal_to_string(0), Some("only"));
        assert_eq!(reader.ordinal_to_string(1), None);
    }

    /// Test 5: a zero-doc column ends up with `KeywordDict::None`, and the
    /// extent walk steps over a blocked column (the `8 =>` arm) so that a later
    /// column is found at the right offset.
    #[test]
    fn keyword_none_column() {
        // No values added → write_empty → doc_count 0.
        let empty_bytes = ColumnWriter::new(FieldId::new(0)).finish();
        let empty = ColumnReader::open(&empty_bytes);
        assert!(matches!(empty.keyword, KeywordDict::None));
        assert_eq!(empty.dict_size(), 0);
        assert_eq!(empty.ordinal_to_string(0), None);
        assert_eq!(empty.keyword_value(0), None);

        // Two blocked keyword columns: reading the second one proves the walk
        // landed exactly on its first byte.
        let mut columnar = ColumnarWriter::new();
        columnar.add(FieldId::new(0), ColumnValue::Keyword("alpha".into()));
        columnar.add(FieldId::new(1), ColumnValue::Keyword("beta".into()));
        columnar.pad_to(1);
        let columnar_bytes = columnar.finish();
        let columns = ColumnarReader::open(&columnar_bytes);
        for (fid, want) in [(0u16, "alpha"), (1u16, "beta")] {
            let column = columns.column(FieldId::new(fid)).unwrap();
            assert_eq!(column.keyword_value(0), Some(want));
        }
    }

    /// Test 6: a hand-built legacy `Keyword = 1` column is read through the
    /// `Eager` back-compat path.
    #[test]
    fn legacy_keyword_column_still_reads() {
        let docs = ["cherry", "apple", "banana", "apple"];
        let bytes = write_legacy_keyword_column(0, &docs);
        let reader = ColumnReader::open(&bytes);

        assert_eq!(reader.col_type(), ColumnType::Keyword);
        assert!(matches!(reader.keyword, KeywordDict::Eager(_)));
        // Dictionary is sorted and deduplicated: apple, banana, cherry.
        assert_eq!(reader.dict_size(), 3);
        for (doc, want) in docs.iter().enumerate() {
            assert_eq!(reader.keyword_value(doc as u32), Some(*want));
        }
        assert_eq!(reader.ordinal_to_string(0), Some("apple"));
        assert_eq!(reader.ordinal_to_string(2), Some("cherry"));
    }

    /// Test 8: dictionary sizes N ∈ {K, K+1, 2K} are checked exhaustively to
    /// catch off-by-one errors in `ord >> SHIFT` / `& MASK` at exact-multiple
    /// and partial-block edges.
    #[test]
    fn block_boundary_exhaustive() {
        let block = DICT_BLOCK_SIZE;
        for n in [block, block + 1, 2 * block] {
            let expected: Vec<String> = (0..n).map(|i| format!("k{i:05}")).collect();
            let mut writer = ColumnWriter::new(FieldId::new(0));
            for key in &expected {
                writer.add(ColumnValue::Keyword(key.clone()));
            }
            let bytes = writer.finish();
            let reader = ColumnReader::open(&bytes);

            assert_eq!(reader.dict_size(), n, "N={n} dict_size");
            // Every ordinal must round-trip, block boundaries included.
            for (i, want) in expected.iter().enumerate() {
                let got = reader.ordinal_to_string(i as u32);
                assert_eq!(got, Some(want.as_str()), "N={n} ordinal {i}");
            }
            assert_eq!(
                reader.ordinal_to_string(n as u32),
                None,
                "N={n} out-of-range"
            );
        }
    }

    /// Test 10: the lazy dictionary is built at most once (the Option-B
    /// load-bearing invariant), and `dict_size()` is already correct before any
    /// build has happened.
    #[test]
    fn ensure_dict_builds_once() {
        let expected: Vec<String> = (0..200).map(|i| format!("v{i:04}")).collect();
        let mut writer = ColumnWriter::new(FieldId::new(0));
        for key in &expected {
            writer.add(ColumnValue::Keyword(key.clone()));
        }
        let bytes = writer.finish();
        let reader = ColumnReader::open(&bytes);
        let builds = || DICT_BUILDS.with(|c| c.get());

        // (c) dict_size() comes from num_entries — correct BEFORE any build.
        assert_eq!(reader.dict_size(), 200);

        // (a) idempotent: 1000 ensure_dict() calls → exactly one build.
        DICT_BUILDS.with(|c| c.set(0));
        (0..1000).for_each(|_| reader.ensure_dict());
        assert_eq!(builds(), 1);

        // (b) a 5000-lookup bulk pass is served from the built Vec, no rebuild.
        for ord in (0..5000u32).map(|i| i % 200) {
            let want = expected[ord as usize].as_str();
            assert_eq!(reader.ordinal_to_string(ord), Some(want));
        }
        assert_eq!(builds(), 1);
    }

    /// Test 12: decoding a corrupt (invalid-UTF-8) dictionary entry must fail
    /// LOUD with a panic — never a silent `None`, which is reserved for "no
    /// such ordinal".
    #[test]
    #[should_panic(expected = "valid UTF-8")]
    fn blocked_corrupt_entry_fails_loud() {
        let mut writer = ColumnWriter::new(FieldId::new(0));
        for i in 0..10 {
            writer.add(ColumnValue::Keyword(format!("value{i}")));
        }
        let mut bytes = writer.finish();

        // Corrupt the first 'v' (the start of "value0", the body's first
        // entry) with an invalid UTF-8 start byte. Length prefixes precede it
        // and are not 'v'.
        let first_v = bytes
            .iter()
            .position(|&b| b == b'v')
            .expect("dict body contains 'v'");
        bytes[first_v] = 0xFF;

        let reader = ColumnReader::open(&bytes);
        let entry_count = reader.dict_size() as u32;
        for ord in 0..entry_count {
            let _ = reader.ordinal_to_string(ord);
        }
    }
}