hermes_core/structures/
fast_field.rs

//! Fast field columnar storage for efficient filtering and sorting.
//!
//! Stores one column per fast-field, indexed by doc_id for O(1) access.
//! Supports u64, i64, f64, and text (dictionary-encoded ordinal) columns.
//!
//! ## File format (`.fast`)
//!
//! ```text
//! [column 0 bit-packed data] [column 1 data] ... [column N data]
//! [dict 0: len-prefixed sorted strings] [dict 1] ...  (text columns only)
//! [TOC: FastFieldTocEntry × num_columns]
//! [footer: toc_offset(8) + num_columns(4) + magic(4)]  = 16 bytes
//! ```
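//!
//! A minimal end-to-end sketch (illustrative only; it mirrors the round-trip
//! exercised by the tests at the bottom of this file and assumes this module's
//! items are in scope):
//!
//! ```ignore
//! // Build one u64 column, then append the TOC and footer.
//! let mut col = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
//! col.add_u64(0, 100);
//! col.add_u64(1, 250);
//! let mut buf = Vec::new();
//! let (mut toc, _bytes) = col.serialize(&mut buf, 0).unwrap();
//! toc.field_id = 1;
//! let toc_offset = buf.len() as u64;
//! write_fast_field_toc_and_footer(&mut buf, toc_offset, &[toc]).unwrap();
//!
//! // Read it back: footer → TOC → column.
//! let file = OwnedBytes::new(buf);
//! let (toc_off, num_cols) = read_fast_field_footer(&file).unwrap();
//! let tocs = read_fast_field_toc(&file, toc_off, num_cols).unwrap();
//! let reader = FastFieldReader::open(&file, &tocs[0]).unwrap();
//! assert_eq!(reader.get_u64(1), 250);
//! ```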

use std::collections::BTreeMap;
use std::io::{self, Read, Write};

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

// ── Constants ─────────────────────────────────────────────────────────────

/// Magic number for `.fast` file footer ("FST1" in LE)
pub const FAST_FIELD_MAGIC: u32 = 0x31545346;

/// Footer size: toc_offset(8) + num_columns(4) + magic(4) = 16
pub const FAST_FIELD_FOOTER_SIZE: u64 = 16;

/// Sentinel ordinal for missing text values
pub const TEXT_MISSING_ORDINAL: u64 = u64::MAX;

// ── Column type ───────────────────────────────────────────────────────────

/// Type of a fast-field column (stored in TOC).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum FastFieldColumnType {
    U64 = 0,
    I64 = 1,
    F64 = 2,
    TextOrdinal = 3,
}

impl FastFieldColumnType {
    pub fn from_u8(v: u8) -> Option<Self> {
        match v {
            0 => Some(Self::U64),
            1 => Some(Self::I64),
            2 => Some(Self::F64),
            3 => Some(Self::TextOrdinal),
            _ => None,
        }
    }
}

// ── Encoding helpers ──────────────────────────────────────────────────────

/// Zigzag-encode an i64 to u64 (small absolute values → small u64).
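///
/// A quick illustrative sketch of the mapping (not a doctest; assumes this
/// module's items are in scope):
///
/// ```ignore
/// assert_eq!(zigzag_encode(0), 0);
/// assert_eq!(zigzag_encode(-1), 1);
/// assert_eq!(zigzag_encode(1), 2);
/// assert_eq!(zigzag_decode(zigzag_encode(-42)), -42);
/// ```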
#[inline]
pub fn zigzag_encode(v: i64) -> u64 {
    ((v << 1) ^ (v >> 63)) as u64
}

/// Zigzag-decode a u64 back to i64.
#[inline]
pub fn zigzag_decode(v: u64) -> i64 {
    ((v >> 1) as i64) ^ -((v & 1) as i64)
}

/// Encode f64 to u64 preserving total order.
/// Positive floats: flip the sign bit (so they sort above negatives).
/// Negative floats: flip all bits (so larger magnitudes sort lower).
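///
/// A small illustrative check of the ordering (not a doctest; assumes this
/// module's items are in scope):
///
/// ```ignore
/// let a = f64_to_sortable_u64(-1.5);
/// let b = f64_to_sortable_u64(0.0);
/// let c = f64_to_sortable_u64(2.0);
/// assert!(a < b && b < c);
/// assert_eq!(sortable_u64_to_f64(c), 2.0);
/// ```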
#[inline]
pub fn f64_to_sortable_u64(f: f64) -> u64 {
    let bits = f.to_bits();
    if (bits >> 63) == 0 {
        bits ^ (1u64 << 63) // positive: flip sign bit
    } else {
        !bits // negative: flip all bits
    }
}

/// Decode sortable u64 back to f64.
#[inline]
pub fn sortable_u64_to_f64(v: u64) -> f64 {
    let bits = if (v >> 63) != 0 {
        v ^ (1u64 << 63) // was positive: unflip sign bit
    } else {
        !v // was negative: unflip all bits
    };
    f64::from_bits(bits)
}

/// Minimum number of bits needed to represent `val`.
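///
/// Illustrative values (not a doctest):
///
/// ```ignore
/// assert_eq!(bits_needed_u64(0), 0);
/// assert_eq!(bits_needed_u64(1), 1);
/// assert_eq!(bits_needed_u64(255), 8);
/// assert_eq!(bits_needed_u64(256), 9);
/// ```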
#[inline]
pub fn bits_needed_u64(val: u64) -> u8 {
    if val == 0 {
        0
    } else {
        64 - val.leading_zeros() as u8
    }
}

// ── Bit-packing ───────────────────────────────────────────────────────────

/// Pack `values` at `bits_per_value` bits each into `out`.
/// Appends `ceil(values.len() * bits_per_value / 8)` bytes to `out`.
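///
/// A minimal round-trip sketch (illustrative; mirrors `test_bitpack_roundtrip`
/// below):
///
/// ```ignore
/// let values = [0u64, 3, 7, 15];
/// let mut packed = Vec::new();
/// bitpack_write(&values, 4, &mut packed); // 4 bits per value → 2 bytes
/// assert_eq!(bitpack_read(&packed, 4, 2), 7);
/// ```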
pub fn bitpack_write(values: &[u64], bits_per_value: u8, out: &mut Vec<u8>) {
    if bits_per_value == 0 {
        return; // all values are the same (constant column)
    }
    let bpv = bits_per_value as usize;
    let total_bits = values.len() * bpv;
    let total_bytes = total_bits.div_ceil(8);
    out.reserve(total_bytes);

    let start = out.len();
    out.resize(start + total_bytes, 0);
    let buf = &mut out[start..];

    for (i, &val) in values.iter().enumerate() {
        let bit_offset = i * bpv;
        let byte_offset = bit_offset / 8;
        let bit_shift = bit_offset % 8;

        // Write across byte boundaries (up to 9 bytes for 64-bit values)
        let mut remaining_bits = bpv;
        let mut v = val;
        let mut bo = byte_offset;
        let mut bs = bit_shift;

        while remaining_bits > 0 {
            let can_write = (8 - bs).min(remaining_bits);
            let mask = (1u64 << can_write) - 1;
            buf[bo] |= ((v & mask) << bs) as u8;
            v >>= can_write;
            remaining_bits -= can_write;
            bo += 1;
            bs = 0;
        }
    }
}

/// Read value at `index` from bit-packed data.
///
/// Fast path: reads a single unaligned u64 (LE) covering the target bits,
/// then shifts and masks. This compiles to a handful of instructions on
/// x86/ARM and avoids the per-byte loop whenever the target bits fit in one
/// 8-byte window (always the case for bpv ≤ 57).
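///
/// Worked index math (illustrative), for `bits_per_value = 13`, `index = 5`:
///
/// ```text
/// bit_offset  = 5 * 13 = 65
/// byte_offset = 65 / 8 = 8
/// bit_shift   = 65 % 8 = 1
/// → load 8 bytes at offset 8, shift right by 1, mask to 13 bits
/// ```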
#[inline]
pub fn bitpack_read(data: &[u8], bits_per_value: u8, index: usize) -> u64 {
    if bits_per_value == 0 {
        return 0;
    }
    let bpv = bits_per_value as usize;
    let bit_offset = index * bpv;
    let byte_offset = bit_offset / 8;
    let bit_shift = bit_offset % 8;

    // Fast path: if the target bits fit inside one 8-byte window that lies
    // within the buffer, do a single unaligned LE u64 load, shift, and mask.
    if bit_shift + bpv <= 64 && byte_offset + 8 <= data.len() {
        let raw = u64::from_le_bytes(data[byte_offset..byte_offset + 8].try_into().unwrap());
        let mask = if bpv >= 64 {
            u64::MAX
        } else {
            (1u64 << bpv) - 1
        };
        return (raw >> bit_shift) & mask;
    }

    // Slow path: values near the end of the buffer, or wide values that
    // straddle the 8-byte window.
    let mut result: u64 = 0;
    let mut remaining_bits = bpv;
    let mut bo = byte_offset;
    let mut bs = bit_shift;
    let mut out_shift = 0;

    while remaining_bits > 0 {
        let can_read = (8 - bs).min(remaining_bits);
        let mask = ((1u64 << can_read) - 1) as u8;
        let byte_val = if bo < data.len() { data[bo] } else { 0 };
        result |= (((byte_val >> bs) & mask) as u64) << out_shift;
        remaining_bits -= can_read;
        out_shift += can_read;
        bo += 1;
        bs = 0;
    }

    result
}

// ── TOC entry ─────────────────────────────────────────────────────────────

/// On-disk TOC entry for a fast-field column.
///
/// Wire: field_id(4) + column_type(1) + data_offset(8) + num_docs(4) +
///       min_value(8) + bits_per_value(1) + dict_offset(8) + dict_count(4) = 38 bytes
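///
/// A minimal write/read round-trip sketch (illustrative; the field values here
/// are arbitrary):
///
/// ```ignore
/// let entry = FastFieldTocEntry {
///     field_id: 7,
///     column_type: FastFieldColumnType::U64,
///     data_offset: 0,
///     num_docs: 3,
///     min_value: 10,
///     bits_per_value: 2,
///     dict_offset: 0,
///     dict_count: 0,
/// };
/// let mut buf = Vec::new();
/// entry.write_to(&mut buf).unwrap();
/// assert_eq!(buf.len(), FAST_FIELD_TOC_ENTRY_SIZE);
/// let back = FastFieldTocEntry::read_from(&mut std::io::Cursor::new(&buf)).unwrap();
/// assert_eq!(back.num_docs, 3);
/// ```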
#[derive(Debug, Clone)]
pub struct FastFieldTocEntry {
    pub field_id: u32,
    pub column_type: FastFieldColumnType,
    pub data_offset: u64,
    pub num_docs: u32,
    pub min_value: u64,
    pub bits_per_value: u8,
    /// Byte offset of the text dictionary section (0 for numeric columns).
    pub dict_offset: u64,
    /// Number of entries in the text dictionary (0 for numeric columns).
    pub dict_count: u32,
}

pub const FAST_FIELD_TOC_ENTRY_SIZE: usize = 4 + 1 + 8 + 4 + 8 + 1 + 8 + 4; // 38

impl FastFieldTocEntry {
    pub fn write_to(&self, w: &mut dyn Write) -> io::Result<()> {
        w.write_u32::<LittleEndian>(self.field_id)?;
        w.write_u8(self.column_type as u8)?;
        w.write_u64::<LittleEndian>(self.data_offset)?;
        w.write_u32::<LittleEndian>(self.num_docs)?;
        w.write_u64::<LittleEndian>(self.min_value)?;
        w.write_u8(self.bits_per_value)?;
        w.write_u64::<LittleEndian>(self.dict_offset)?;
        w.write_u32::<LittleEndian>(self.dict_count)?;
        Ok(())
    }

    pub fn read_from(r: &mut dyn Read) -> io::Result<Self> {
        let field_id = r.read_u32::<LittleEndian>()?;
        let ct = r.read_u8()?;
        let column_type = FastFieldColumnType::from_u8(ct)
            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "bad column type"))?;
        let data_offset = r.read_u64::<LittleEndian>()?;
        let num_docs = r.read_u32::<LittleEndian>()?;
        let min_value = r.read_u64::<LittleEndian>()?;
        let bits_per_value = r.read_u8()?;
        let dict_offset = r.read_u64::<LittleEndian>()?;
        let dict_count = r.read_u32::<LittleEndian>()?;
        Ok(Self {
            field_id,
            column_type,
            data_offset,
            num_docs,
            min_value,
            bits_per_value,
            dict_offset,
            dict_count,
        })
    }
}

// ── Writer ────────────────────────────────────────────────────────────────

/// Collects values during indexing and serializes a single fast-field column.
pub struct FastFieldWriter {
    pub column_type: FastFieldColumnType,
    /// Raw u64 values indexed by local doc_id.
    /// For text: stores temporary placeholder 0, replaced by ordinals at build time.
    values: Vec<u64>,
    /// For TextOrdinal: maps original string → insertion order.
    /// Sorted dictionary is built at serialization time.
    text_values: Option<BTreeMap<String, u32>>,
    /// For TextOrdinal: per-doc string values (parallel to `values`).
    text_per_doc: Option<Vec<Option<String>>>,
}

impl FastFieldWriter {
    /// Create a writer for a numeric column (u64/i64/f64).
    pub fn new_numeric(column_type: FastFieldColumnType) -> Self {
        debug_assert!(matches!(
            column_type,
            FastFieldColumnType::U64 | FastFieldColumnType::I64 | FastFieldColumnType::F64
        ));
        Self {
            column_type,
            values: Vec::new(),
            text_values: None,
            text_per_doc: None,
        }
    }

    /// Create a writer for a text ordinal column.
    pub fn new_text() -> Self {
        Self {
            column_type: FastFieldColumnType::TextOrdinal,
            values: Vec::new(),
            text_values: Some(BTreeMap::new()),
            text_per_doc: Some(Vec::new()),
        }
    }

    /// Record a numeric value for `doc_id`. Fills gaps with 0.
    pub fn add_u64(&mut self, doc_id: u32, value: u64) {
        let idx = doc_id as usize;
        if idx >= self.values.len() {
            self.values.resize(idx + 1, 0);
            if let Some(ref mut tpd) = self.text_per_doc {
                tpd.resize(idx + 1, None);
            }
        }
        self.values[idx] = value;
    }

    /// Record an i64 value (zigzag-encoded).
    pub fn add_i64(&mut self, doc_id: u32, value: i64) {
        self.add_u64(doc_id, zigzag_encode(value));
    }

    /// Record an f64 value (sortable-encoded).
    pub fn add_f64(&mut self, doc_id: u32, value: f64) {
        self.add_u64(doc_id, f64_to_sortable_u64(value));
    }

    /// Record a text value (dictionary-encoded at build time).
    pub fn add_text(&mut self, doc_id: u32, value: &str) {
        let idx = doc_id as usize;
        if idx >= self.values.len() {
            self.values.resize(idx + 1, 0);
        }
        if let Some(ref mut tpd) = self.text_per_doc {
            if idx >= tpd.len() {
                tpd.resize(idx + 1, None);
            }
            tpd[idx] = Some(value.to_string());
        }
        if let Some(ref mut dict) = self.text_values {
            let next_id = dict.len() as u32;
            dict.entry(value.to_string()).or_insert(next_id);
        }
    }

    /// Ensure the values array covers `num_docs` entries.
    pub fn pad_to(&mut self, num_docs: u32) {
        let n = num_docs as usize;
        if self.values.len() < n {
            self.values.resize(n, 0);
            if let Some(ref mut tpd) = self.text_per_doc {
                tpd.resize(n, None);
            }
        }
    }

    /// Number of documents in this column.
    pub fn num_docs(&self) -> u32 {
        self.values.len() as u32
    }

    /// Serialize the column data to `writer`, where `data_offset` is the
    /// absolute file offset at which this column's data begins.
    ///
    /// For text columns, the dictionary is written immediately after the
    /// column data. Returns the TOC entry (with `field_id` left at 0 for the
    /// caller to fill in) and the total number of bytes written.
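    ///
    /// A minimal sketch (illustrative; mirrors `test_constant_column` below):
    ///
    /// ```ignore
    /// let mut w = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
    /// w.add_u64(0, 42);
    /// w.add_u64(1, 42);
    /// let mut buf = Vec::new();
    /// let (toc, bytes_written) = w.serialize(&mut buf, 0).unwrap();
    /// assert_eq!(toc.bits_per_value, 0); // constant column → no data bytes
    /// assert_eq!(bytes_written, 0);
    /// ```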
    pub fn serialize(
        &mut self,
        writer: &mut dyn Write,
        data_offset: u64,
    ) -> io::Result<(FastFieldTocEntry, u64)> {
        let num_docs = self.values.len() as u32;

        // For text ordinal: resolve per-doc strings to sorted ordinals
        if self.column_type == FastFieldColumnType::TextOrdinal {
            self.resolve_text_ordinals();
        }

        // Compute min/max for bit-packing
        let (min_value, bits_per_value) = if self.values.is_empty() {
            (0u64, 0u8)
        } else {
            let min_val = *self.values.iter().min().unwrap();
            let max_val = *self.values.iter().max().unwrap();
            let range = max_val - min_val;
            (min_val, bits_needed_u64(range))
        };

        // Bit-pack values (min-subtracted)
        let mut packed = Vec::new();
        if bits_per_value > 0 {
            let shifted: Vec<u64> = self.values.iter().map(|&v| v - min_value).collect();
            bitpack_write(&shifted, bits_per_value, &mut packed);
        }

        // Write column data
        writer.write_all(&packed)?;
        let data_end = data_offset + packed.len() as u64;

        // Write text dictionary (if applicable)
        let (dict_offset, dict_count, dict_bytes) =
            if self.column_type == FastFieldColumnType::TextOrdinal {
                let dict_off = data_end;
                let (count, bytes) = self.write_text_dictionary(writer)?;
                (dict_off, count, bytes)
            } else {
                (0u64, 0u32, 0u64)
            };

        let toc = FastFieldTocEntry {
            field_id: 0, // set by caller
            column_type: self.column_type,
            data_offset,
            num_docs,
            min_value,
            bits_per_value,
            dict_offset,
            dict_count,
        };

        Ok((toc, packed.len() as u64 + dict_bytes))
    }

    /// Resolve text per-doc values to sorted ordinals.
    fn resolve_text_ordinals(&mut self) {
        let dict = self.text_values.as_ref().expect("text_values required");
        let tpd = self.text_per_doc.as_ref().expect("text_per_doc required");

        // Build sorted ordinal map: BTreeMap iterates in sorted order
        let sorted_ordinals: BTreeMap<&str, u64> = dict
            .keys()
            .enumerate()
            .map(|(ord, key)| (key.as_str(), ord as u64))
            .collect();

        // Assign ordinals to values
        for (i, doc_text) in tpd.iter().enumerate() {
            match doc_text {
                Some(text) => {
                    self.values[i] = sorted_ordinals[text.as_str()];
                }
                None => {
                    self.values[i] = TEXT_MISSING_ORDINAL;
                }
            }
        }
    }

    /// Write len-prefixed sorted strings. Returns (dict_count, bytes_written).
    fn write_text_dictionary(&self, writer: &mut dyn Write) -> io::Result<(u32, u64)> {
        let dict = self.text_values.as_ref().expect("text_values required");
        let mut bytes_written = 0u64;

        // BTreeMap keys are already sorted
        let count = dict.len() as u32;
        for key in dict.keys() {
            let key_bytes = key.as_bytes();
            writer.write_u32::<LittleEndian>(key_bytes.len() as u32)?;
            writer.write_all(key_bytes)?;
            bytes_written += 4 + key_bytes.len() as u64;
        }

        Ok((count, bytes_written))
    }
}

// ── Reader ────────────────────────────────────────────────────────────────

use crate::directories::OwnedBytes;

/// Reads a single fast-field column from mmap/buffer with O(1) doc_id access.
///
/// **Zero-copy**: column data and text dictionary are borrowed slices of the
/// underlying mmap / `OwnedBytes`; no heap allocation per column.
pub struct FastFieldReader {
    pub column_type: FastFieldColumnType,
    pub num_docs: u32,
    min_value: u64,
    bits_per_value: u8,
    /// Bit-packed column data — zero-copy slice of the `.fast` file.
    data: OwnedBytes,
    /// Text dictionary: sorted strings (only for TextOrdinal).
    text_dict: Option<TextDictReader>,
}

impl FastFieldReader {
    /// Open a column from an `OwnedBytes` file buffer using a TOC entry.
    /// Zero-copy: the returned reader borrows slices of `file_data`.
    pub fn open(file_data: &OwnedBytes, toc: &FastFieldTocEntry) -> io::Result<Self> {
        let data_start = toc.data_offset as usize;
        let bpv = toc.bits_per_value as usize;
        let data_len = if bpv == 0 {
            0
        } else {
            (toc.num_docs as usize * bpv).div_ceil(8)
        };
        let data_end = data_start + data_len;

        if data_end > file_data.len() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "fast field data out of bounds",
            ));
        }

        // Zero-copy slice — shares the underlying mmap/Arc<Vec>
        let data = file_data.slice(data_start..data_end);

        let text_dict = if toc.column_type == FastFieldColumnType::TextOrdinal && toc.dict_count > 0
        {
            let dict_start = toc.dict_offset as usize;
            Some(TextDictReader::open(file_data, dict_start, toc.dict_count)?)
        } else {
            None
        };

        Ok(Self {
            column_type: toc.column_type,
            num_docs: toc.num_docs,
            min_value: toc.min_value,
            bits_per_value: toc.bits_per_value,
            data,
            text_dict,
        })
    }

    /// Get the raw u64 value for a doc_id. Returns 0 for out-of-range doc_ids.
    #[inline]
    pub fn get_u64(&self, doc_id: u32) -> u64 {
        if doc_id >= self.num_docs {
            return 0;
        }
        if self.bits_per_value == 0 {
            return self.min_value;
        }
        let raw = bitpack_read(self.data.as_slice(), self.bits_per_value, doc_id as usize);
        raw + self.min_value
    }

    /// Get decoded i64 value (zigzag-decoded).
    #[inline]
    pub fn get_i64(&self, doc_id: u32) -> i64 {
        zigzag_decode(self.get_u64(doc_id))
    }

    /// Get decoded f64 value (sortable-decoded).
    #[inline]
    pub fn get_f64(&self, doc_id: u32) -> f64 {
        sortable_u64_to_f64(self.get_u64(doc_id))
    }

    /// Get the text ordinal for a doc_id. Returns TEXT_MISSING_ORDINAL if missing.
    #[inline]
    pub fn get_ordinal(&self, doc_id: u32) -> u64 {
        self.get_u64(doc_id)
    }

    /// Get the text string for a doc_id (looks up ordinal in dictionary).
    /// Returns None if the doc has no value or ordinal is missing.
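    ///
    /// A minimal sketch of the text path (illustrative; mirrors
    /// `test_writer_reader_text_roundtrip` below):
    ///
    /// ```ignore
    /// let mut w = FastFieldWriter::new_text();
    /// w.add_text(0, "banana");
    /// w.add_text(1, "apple");
    /// let mut buf = Vec::new();
    /// let (toc, _) = w.serialize(&mut buf, 0).unwrap();
    /// let ob = OwnedBytes::new(buf);
    /// let reader = FastFieldReader::open(&ob, &toc).unwrap();
    /// // Ordinals follow dictionary (sorted) order: apple = 0, banana = 1.
    /// assert_eq!(reader.get_text(0), Some("banana"));
    /// assert_eq!(reader.text_ordinal("apple"), Some(0));
    /// ```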
    pub fn get_text(&self, doc_id: u32) -> Option<&str> {
        let ordinal = self.get_ordinal(doc_id);
        if ordinal == TEXT_MISSING_ORDINAL {
            return None;
        }
        self.text_dict.as_ref().and_then(|d| d.get(ordinal as u32))
    }

    /// Look up text string → ordinal. Returns None if not found.
    pub fn text_ordinal(&self, text: &str) -> Option<u64> {
        self.text_dict.as_ref().and_then(|d| d.ordinal(text))
    }

    /// Access the text dictionary reader (if this is a text column).
    pub fn text_dict(&self) -> Option<&TextDictReader> {
        self.text_dict.as_ref()
    }

    /// Min value stored in this column (before encoding).
    pub fn min_value(&self) -> u64 {
        self.min_value
    }

    /// Bits per value in this column.
    pub fn bits_per_value(&self) -> u8 {
        self.bits_per_value
    }

    /// Raw packed column bytes — for stackable merge (zero-copy).
    pub fn raw_data(&self) -> &OwnedBytes {
        &self.data
    }
}

// ── Text dictionary ───────────────────────────────────────────────────────

/// Sorted dictionary for text ordinal columns.
///
/// **Zero-copy**: the dictionary data is a shared slice of the `.fast` file.
/// An offset table (one `(offset, len)` pair per entry) is built at open time
/// so that individual strings can be accessed without scanning.
pub struct TextDictReader {
    /// The raw dictionary bytes from the `.fast` file (zero-copy).
    data: OwnedBytes,
    /// Per-entry (offset, len) pairs into `data` — built at open time.
    offsets: Vec<(u32, u32)>,
}

impl TextDictReader {
    /// Open a zero-copy text dictionary from `file_data` starting at `dict_start`.
    pub fn open(file_data: &OwnedBytes, dict_start: usize, count: u32) -> io::Result<Self> {
        // First pass: scan len-prefixed entries to build offset table
        let dict_slice = file_data.as_slice();
        let mut pos = dict_start;
        let mut offsets = Vec::with_capacity(count as usize);

        for _ in 0..count {
            if pos + 4 > dict_slice.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "text dict truncated",
                ));
            }
            let len = u32::from_le_bytes([
                dict_slice[pos],
                dict_slice[pos + 1],
                dict_slice[pos + 2],
                dict_slice[pos + 3],
            ]) as usize;
            pos += 4;
            if pos + len > dict_slice.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "text dict entry truncated",
                ));
            }
            // Validate UTF-8 eagerly
            std::str::from_utf8(&dict_slice[pos..pos + len]).map_err(|e| {
                io::Error::new(io::ErrorKind::InvalidData, format!("invalid utf8: {}", e))
            })?;
            offsets.push((pos as u32, len as u32));
            pos += len;
        }

        // Slice the full dict region (dict_start..pos) zero-copy
        let data = file_data.slice(dict_start..pos);

        // Adjust offsets to be relative to `data` (subtract dict_start)
        for entry in offsets.iter_mut() {
            entry.0 -= dict_start as u32;
        }

        Ok(Self { data, offsets })
    }

    /// Get string by ordinal — zero-copy borrow from the underlying file data.
    pub fn get(&self, ordinal: u32) -> Option<&str> {
        let &(off, len) = self.offsets.get(ordinal as usize)?;
        let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
        // Safety: we validated UTF-8 in open()
        Some(unsafe { std::str::from_utf8_unchecked(slice) })
    }

    /// Binary search for a string → ordinal.
    pub fn ordinal(&self, text: &str) -> Option<u64> {
        self.offsets
            .binary_search_by(|&(off, len)| {
                let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
                // Safety: validated UTF-8 in open()
                let entry = unsafe { std::str::from_utf8_unchecked(slice) };
                entry.cmp(text)
            })
            .ok()
            .map(|i| i as u64)
    }

    /// Number of entries in the dictionary.
    pub fn len(&self) -> u32 {
        self.offsets.len() as u32
    }

    /// Whether the dictionary is empty.
    pub fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }

    /// Iterate all entries.
    pub fn iter(&self) -> impl Iterator<Item = &str> {
        self.offsets.iter().map(|&(off, len)| {
            let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
            unsafe { std::str::from_utf8_unchecked(slice) }
        })
    }
}
677
678// ── File-level write/read ─────────────────────────────────────────────────
679
680/// Write fast-field TOC + footer.
681pub fn write_fast_field_toc_and_footer(
682    writer: &mut dyn Write,
683    toc_offset: u64,
684    entries: &[FastFieldTocEntry],
685) -> io::Result<()> {
686    for e in entries {
687        e.write_to(writer)?;
688    }
689    writer.write_u64::<LittleEndian>(toc_offset)?;
690    writer.write_u32::<LittleEndian>(entries.len() as u32)?;
691    writer.write_u32::<LittleEndian>(FAST_FIELD_MAGIC)?;
692    Ok(())
693}
694
695/// Read fast-field footer from the last 16 bytes.
696/// Returns (toc_offset, num_columns).
697pub fn read_fast_field_footer(file_data: &[u8]) -> io::Result<(u64, u32)> {
698    let len = file_data.len();
699    if len < FAST_FIELD_FOOTER_SIZE as usize {
700        return Err(io::Error::new(
701            io::ErrorKind::UnexpectedEof,
702            "fast field file too small for footer",
703        ));
704    }
705    let footer = &file_data[len - FAST_FIELD_FOOTER_SIZE as usize..];
706    let mut cursor = std::io::Cursor::new(footer);
707    let toc_offset = cursor.read_u64::<LittleEndian>()?;
708    let num_columns = cursor.read_u32::<LittleEndian>()?;
709    let magic = cursor.read_u32::<LittleEndian>()?;
710    if magic != FAST_FIELD_MAGIC {
711        return Err(io::Error::new(
712            io::ErrorKind::InvalidData,
713            format!("bad fast field magic: 0x{:08x}", magic),
714        ));
715    }
716    Ok((toc_offset, num_columns))
717}
718
719/// Read all TOC entries from file data.
720pub fn read_fast_field_toc(
721    file_data: &[u8],
722    toc_offset: u64,
723    num_columns: u32,
724) -> io::Result<Vec<FastFieldTocEntry>> {
725    let start = toc_offset as usize;
726    let expected = num_columns as usize * FAST_FIELD_TOC_ENTRY_SIZE;
727    if start + expected > file_data.len() {
728        return Err(io::Error::new(
729            io::ErrorKind::UnexpectedEof,
730            "fast field TOC out of bounds",
731        ));
732    }
733    let mut cursor = std::io::Cursor::new(&file_data[start..start + expected]);
734    let mut entries = Vec::with_capacity(num_columns as usize);
735    for _ in 0..num_columns {
736        entries.push(FastFieldTocEntry::read_from(&mut cursor)?);
737    }
738    Ok(entries)
739}
740
// ── Tests ─────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_zigzag_roundtrip() {
        for v in [0i64, 1, -1, 42, -42, i64::MAX, i64::MIN] {
            assert_eq!(zigzag_decode(zigzag_encode(v)), v);
        }
    }

    #[test]
    fn test_f64_sortable_roundtrip() {
        for v in [0.0f64, 1.0, -1.0, f64::MAX, f64::MIN, f64::MIN_POSITIVE] {
            assert_eq!(sortable_u64_to_f64(f64_to_sortable_u64(v)), v);
        }
    }

    #[test]
    fn test_f64_sortable_order() {
        let values = [-100.0f64, -1.0, -0.0, 0.0, 0.5, 1.0, 100.0];
        let encoded: Vec<u64> = values.iter().map(|&v| f64_to_sortable_u64(v)).collect();
        for i in 1..encoded.len() {
            assert!(
                encoded[i] >= encoded[i - 1],
                "{} >= {} failed for {} vs {}",
                encoded[i],
                encoded[i - 1],
                values[i],
                values[i - 1]
            );
        }
    }

    #[test]
    fn test_bitpack_roundtrip() {
        let values: Vec<u64> = vec![0, 3, 7, 15, 0, 1, 6, 12];
        let bpv = 4u8;
        let mut packed = Vec::new();
        bitpack_write(&values, bpv, &mut packed);

        for (i, &expected) in values.iter().enumerate() {
            let got = bitpack_read(&packed, bpv, i);
            assert_eq!(got, expected, "index {}", i);
        }
    }

    #[test]
    fn test_bitpack_various_widths() {
        for bpv in [1u8, 2, 3, 5, 7, 8, 13, 16, 32, 64] {
            let max_val = if bpv == 64 {
                u64::MAX
            } else {
                (1u64 << bpv) - 1
            };
            let values: Vec<u64> = (0..100)
                .map(|i: u64| {
                    if max_val == u64::MAX {
                        i
                    } else {
                        i % (max_val + 1)
                    }
                })
                .collect();
            let mut packed = Vec::new();
            bitpack_write(&values, bpv, &mut packed);

            for (i, &expected) in values.iter().enumerate() {
                let got = bitpack_read(&packed, bpv, i);
                assert_eq!(got, expected, "bpv={} index={}", bpv, i);
            }
        }
    }

    /// Helper: wrap a Vec<u8> in OwnedBytes for tests.
    fn owned(buf: Vec<u8>) -> OwnedBytes {
        OwnedBytes::new(buf)
    }

    #[test]
    fn test_writer_reader_u64_roundtrip() {
        let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
        writer.add_u64(0, 100);
        writer.add_u64(1, 200);
        writer.add_u64(2, 150);
        writer.add_u64(4, 300); // gap at doc_id=3
        writer.pad_to(5);

        let mut buf = Vec::new();
        let (mut toc, _bytes) = writer.serialize(&mut buf, 0).unwrap();
        toc.field_id = 42;

        // Write TOC + footer
        let toc_offset = buf.len() as u64;
        write_fast_field_toc_and_footer(&mut buf, toc_offset, &[toc]).unwrap();

        // Read back
        let ob = owned(buf);
        let (toc_off, num_cols) = read_fast_field_footer(&ob).unwrap();
        assert_eq!(num_cols, 1);
        let tocs = read_fast_field_toc(&ob, toc_off, num_cols).unwrap();
        assert_eq!(tocs.len(), 1);
        assert_eq!(tocs[0].field_id, 42);

        let reader = FastFieldReader::open(&ob, &tocs[0]).unwrap();
        assert_eq!(reader.get_u64(0), 100);
        assert_eq!(reader.get_u64(1), 200);
        assert_eq!(reader.get_u64(2), 150);
        assert_eq!(reader.get_u64(3), 0); // gap filled with 0
        assert_eq!(reader.get_u64(4), 300);
    }

    #[test]
    fn test_writer_reader_i64_roundtrip() {
        let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::I64);
        writer.add_i64(0, -100);
        writer.add_i64(1, 50);
        writer.add_i64(2, 0);
        writer.pad_to(3);

        let mut buf = Vec::new();
        let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
        let ob = owned(buf);
        let reader = FastFieldReader::open(&ob, &toc).unwrap();
        assert_eq!(reader.get_i64(0), -100);
        assert_eq!(reader.get_i64(1), 50);
        assert_eq!(reader.get_i64(2), 0);
    }

    #[test]
    fn test_writer_reader_f64_roundtrip() {
        let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::F64);
        writer.add_f64(0, -1.5);
        writer.add_f64(1, 3.15);
        writer.add_f64(2, 0.0);
        writer.pad_to(3);

        let mut buf = Vec::new();
        let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
        let ob = owned(buf);
        let reader = FastFieldReader::open(&ob, &toc).unwrap();
        assert_eq!(reader.get_f64(0), -1.5);
        assert_eq!(reader.get_f64(1), 3.15);
        assert_eq!(reader.get_f64(2), 0.0);
    }

    #[test]
    fn test_writer_reader_text_roundtrip() {
        let mut writer = FastFieldWriter::new_text();
        writer.add_text(0, "banana");
        writer.add_text(1, "apple");
        writer.add_text(2, "cherry");
        writer.add_text(3, "apple"); // duplicate
        // doc_id=4 has no value
        writer.pad_to(5);

        let mut buf = Vec::new();
        let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
        let ob = owned(buf);
        let reader = FastFieldReader::open(&ob, &toc).unwrap();

        // Dictionary is sorted: apple=0, banana=1, cherry=2
        assert_eq!(reader.get_text(0), Some("banana"));
        assert_eq!(reader.get_text(1), Some("apple"));
        assert_eq!(reader.get_text(2), Some("cherry"));
        assert_eq!(reader.get_text(3), Some("apple"));
        assert_eq!(reader.get_text(4), None); // missing

        // Ordinal lookups
        assert_eq!(reader.text_ordinal("apple"), Some(0));
        assert_eq!(reader.text_ordinal("banana"), Some(1));
        assert_eq!(reader.text_ordinal("cherry"), Some(2));
        assert_eq!(reader.text_ordinal("durian"), None);
    }

    #[test]
    fn test_constant_column() {
        let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
        for i in 0..100 {
            writer.add_u64(i, 42);
        }

        let mut buf = Vec::new();
        let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
        assert_eq!(toc.bits_per_value, 0); // constant — no data bytes
        assert_eq!(toc.min_value, 42);

        let ob = owned(buf);
        let reader = FastFieldReader::open(&ob, &toc).unwrap();
        for i in 0..100 {
            assert_eq!(reader.get_u64(i), 42);
        }
    }
}