hermes_core/structures/postings/posting_common.rs

//! Shared primitives for posting lists
//!
//! This module contains common code used by both text posting lists and sparse vector posting lists:
//! - Variable-length integer encoding (varint)
//! - Skip list structure for block-based access
//! - Block constants

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Read, Write};

use crate::DocId;

/// Standard block size for posting lists (SIMD-friendly)
pub const BLOCK_SIZE: usize = 128;

/// Write variable-length integer (1-10 bytes)
///
/// Uses continuation bit encoding: 7 bits of data per byte,
/// high bit indicates more bytes follow.
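///
/// # Example
///
/// A minimal round-trip sketch, marked `ignore` so it stays illustrative (no
/// import path is assumed); the byte values follow from the encoding rule above:
///
/// ```ignore
/// let mut buf = Vec::new();
/// write_vint(&mut buf, 300).unwrap();
/// // 300 = 0b10_0101100: low 7 bits with the continuation bit set, then 0b10.
/// assert_eq!(buf, vec![0xAC, 0x02]);
/// assert_eq!(read_vint(&mut buf.as_slice()).unwrap(), 300);
/// ```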
#[inline]
pub fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
    loop {
        let byte = (value & 0x7F) as u8;
        value >>= 7;
        if value == 0 {
            writer.write_u8(byte)?;
            return Ok(());
        } else {
            writer.write_u8(byte | 0x80)?;
        }
    }
}

/// Read variable-length integer
#[inline]
pub fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
    let mut result = 0u64;
    let mut shift = 0;

    loop {
        let byte = reader.read_u8()?;
        result |= ((byte & 0x7F) as u64) << shift;
        if byte & 0x80 == 0 {
            return Ok(result);
        }
        shift += 7;
        if shift >= 64 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "varint too long",
            ));
        }
    }
}

/// Skip list entry for block-based posting lists
///
/// Enables O(log n) seeking by storing metadata for each block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SkipEntry {
    /// First doc_id in the block (absolute)
    pub first_doc: DocId,
    /// Last doc_id in the block
    pub last_doc: DocId,
    /// Byte offset to block data
    pub offset: u32,
}

impl SkipEntry {
    pub fn new(first_doc: DocId, last_doc: DocId, offset: u32) -> Self {
        Self {
            first_doc,
            last_doc,
            offset,
        }
    }

    /// Write skip entry to writer
    pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_u32::<LittleEndian>(self.first_doc)?;
        writer.write_u32::<LittleEndian>(self.last_doc)?;
        writer.write_u32::<LittleEndian>(self.offset)?;
        Ok(())
    }

    /// Read skip entry from reader
    pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
        let first_doc = reader.read_u32::<LittleEndian>()?;
        let last_doc = reader.read_u32::<LittleEndian>()?;
        let offset = reader.read_u32::<LittleEndian>()?;
        Ok(Self {
            first_doc,
            last_doc,
            offset,
        })
    }
}

/// Skip list for block-based posting lists
#[derive(Debug, Clone, Default)]
pub struct SkipList {
    entries: Vec<SkipEntry>,
}

impl SkipList {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            entries: Vec::with_capacity(capacity),
        }
    }

    /// Add a skip entry
    pub fn push(&mut self, first_doc: DocId, last_doc: DocId, offset: u32) {
        self.entries
            .push(SkipEntry::new(first_doc, last_doc, offset));
    }

    /// Number of blocks
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }

    /// Get entry by index
    pub fn get(&self, index: usize) -> Option<&SkipEntry> {
        self.entries.get(index)
    }

    /// Find the index of the first block whose `last_doc >= target`, i.e. the
    /// block that may contain `target`.
    /// Uses binary search on the monotonically increasing `last_doc` values.
    ///
    /// Returns `None` if `target` is beyond all blocks.
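    ///
    /// # Example
    ///
    /// A small sketch, marked `ignore` (it mirrors the unit test at the bottom
    /// of this file):
    ///
    /// ```ignore
    /// let mut skips = SkipList::new();
    /// skips.push(0, 99, 0);
    /// skips.push(100, 199, 100);
    /// assert_eq!(skips.find_block(150), Some(1)); // 150 <= last_doc of block 1
    /// assert_eq!(skips.find_block(200), None);    // past every block
    /// ```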
    pub fn find_block(&self, target: DocId) -> Option<usize> {
        let idx = self.entries.partition_point(|e| e.last_doc < target);
        if idx < self.entries.len() {
            Some(idx)
        } else {
            None
        }
    }

    /// Iterate over entries
    pub fn iter(&self) -> impl Iterator<Item = &SkipEntry> {
        self.entries.iter()
    }

    /// Write skip list to writer
    pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_u32::<LittleEndian>(self.entries.len() as u32)?;
        for entry in &self.entries {
            entry.write(writer)?;
        }
        Ok(())
    }

    /// Read skip list from reader
    pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
        let count = reader.read_u32::<LittleEndian>()? as usize;
        let mut entries = Vec::with_capacity(count);
        for _ in 0..count {
            entries.push(SkipEntry::read(reader)?);
        }
        Ok(Self { entries })
    }

    /// Convert from tuple format (for compatibility)
    pub fn from_tuples(tuples: &[(DocId, DocId, u32)]) -> Self {
        Self {
            entries: tuples
                .iter()
                .map(|(first, last, offset)| SkipEntry::new(*first, *last, *offset))
                .collect(),
        }
    }

    /// Convert to tuple format (for compatibility)
    pub fn to_tuples(&self) -> Vec<(DocId, DocId, u32)> {
        self.entries
            .iter()
            .map(|e| (e.first_doc, e.last_doc, e.offset))
            .collect()
    }
}

/// Write a block of delta-encoded doc_ids
///
/// The block is laid out as a varint count, then the first doc_id as an
/// absolute value, then the remaining doc_ids as deltas from their predecessor.
/// Returns the last doc_id written (0 for an empty slice, in which case
/// nothing is written).
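///
/// # Example
///
/// A sketch of the byte layout, marked `ignore`; every value below happens to
/// fit in a single varint byte:
///
/// ```ignore
/// let mut buf = Vec::new();
/// let last = write_doc_id_block(&mut buf, &[3, 7, 10]).unwrap();
/// assert_eq!(last, 10);
/// // count = 3, first doc_id = 3 (absolute), then the gaps 4 and 3
/// assert_eq!(buf, vec![3, 3, 4, 3]);
/// assert_eq!(read_doc_id_block(&mut buf.as_slice()).unwrap(), vec![3, 7, 10]);
/// ```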
pub fn write_doc_id_block<W: Write>(writer: &mut W, doc_ids: &[DocId]) -> io::Result<DocId> {
    if doc_ids.is_empty() {
        return Ok(0);
    }

    write_vint(writer, doc_ids.len() as u64)?;

    let mut prev = 0u32;
    for (i, &doc_id) in doc_ids.iter().enumerate() {
        if i == 0 {
            // First doc_id: absolute
            write_vint(writer, doc_id as u64)?;
        } else {
            // Rest: delta from previous
            write_vint(writer, (doc_id - prev) as u64)?;
        }
        prev = doc_id;
    }

    Ok(*doc_ids.last().unwrap())
}

/// Read a block of delta-encoded doc_ids
///
/// Returns vector of absolute doc_ids.
pub fn read_doc_id_block<R: Read>(reader: &mut R) -> io::Result<Vec<DocId>> {
    let count = read_vint(reader)? as usize;
    let mut doc_ids = Vec::with_capacity(count);

    let mut prev = 0u32;
    for i in 0..count {
        let value = read_vint(reader)? as u32;
        let doc_id = if i == 0 {
            value // First: absolute
        } else {
            prev + value // Rest: delta
        };
        doc_ids.push(doc_id);
        prev = doc_id;
    }

    Ok(doc_ids)
}

// ============================================================================
// Fixed-width bitpacking for SIMD-friendly delta encoding
// ============================================================================

use crate::structures::simd;

/// Rounded bit width for SIMD-friendly encoding
///
/// Values are rounded up to 0, 8, 16, or 32 bits for efficient SIMD unpacking.
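///
/// # Example
///
/// A trivial sketch, marked `ignore`:
///
/// ```ignore
/// assert_eq!(RoundedBitWidth::from_max_value(300), RoundedBitWidth::Bits16);
/// assert_eq!(RoundedBitWidth::Bits16.bytes_per_value(), 2);
/// ```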
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum RoundedBitWidth {
    /// All values are zero (e.g., consecutive doc IDs)
    Zero = 0,
    /// 8-bit values (0-255)
    Bits8 = 8,
    /// 16-bit values (0-65535)
    Bits16 = 16,
    /// 32-bit values
    Bits32 = 32,
}

impl RoundedBitWidth {
    /// Determine the rounded bit width needed for a maximum value
    pub fn from_max_value(max_val: u32) -> Self {
        if max_val == 0 {
            RoundedBitWidth::Zero
        } else if max_val <= 255 {
            RoundedBitWidth::Bits8
        } else if max_val <= 65535 {
            RoundedBitWidth::Bits16
        } else {
            RoundedBitWidth::Bits32
        }
    }

    /// Bytes per value
    pub fn bytes_per_value(&self) -> usize {
        match self {
            RoundedBitWidth::Zero => 0,
            RoundedBitWidth::Bits8 => 1,
            RoundedBitWidth::Bits16 => 2,
            RoundedBitWidth::Bits32 => 4,
        }
    }

    /// Convert from u8
    pub fn from_u8(v: u8) -> Option<Self> {
        match v {
            0 => Some(RoundedBitWidth::Zero),
            8 => Some(RoundedBitWidth::Bits8),
            16 => Some(RoundedBitWidth::Bits16),
            32 => Some(RoundedBitWidth::Bits32),
            _ => None,
        }
    }
}

/// Pack delta-encoded doc IDs with fixed-width encoding
///
/// Stores (gap - 1) for each delta to save one bit, since gaps in a strictly
/// increasing doc ID sequence are always >= 1.
/// Returns (bit_width, packed_bytes).
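///
/// # Example
///
/// A sketch of the packing, marked `ignore`:
///
/// ```ignore
/// // Consecutive doc IDs: every gap-1 value is 0, so nothing is stored.
/// assert_eq!(pack_deltas_fixed(&[10, 11, 12]), (RoundedBitWidth::Zero, vec![]));
///
/// // Gaps of 5 and 256 become gap-1 values 4 and 255, which fit in 8 bits.
/// let (width, packed) = pack_deltas_fixed(&[0, 5, 261]);
/// assert_eq!(width, RoundedBitWidth::Bits8);
/// assert_eq!(packed, vec![4, 255]);
/// ```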
pub fn pack_deltas_fixed(doc_ids: &[DocId]) -> (RoundedBitWidth, Vec<u8>) {
    if doc_ids.len() <= 1 {
        return (RoundedBitWidth::Zero, Vec::new());
    }

    // Compute deltas and find max
    let mut max_delta = 0u32;
    let mut deltas = Vec::with_capacity(doc_ids.len() - 1);

    for i in 1..doc_ids.len() {
        let delta = doc_ids[i] - doc_ids[i - 1] - 1; // Store gap-1
        deltas.push(delta);
        max_delta = max_delta.max(delta);
    }

    let bit_width = RoundedBitWidth::from_max_value(max_delta);
    let bytes_per_val = bit_width.bytes_per_value();

    if bytes_per_val == 0 {
        return (bit_width, Vec::new());
    }

    let mut packed = Vec::with_capacity(deltas.len() * bytes_per_val);

    match bit_width {
        RoundedBitWidth::Zero => {}
        RoundedBitWidth::Bits8 => {
            for delta in deltas {
                packed.push(delta as u8);
            }
        }
        RoundedBitWidth::Bits16 => {
            for delta in deltas {
                packed.extend_from_slice(&(delta as u16).to_le_bytes());
            }
        }
        RoundedBitWidth::Bits32 => {
            for delta in deltas {
                packed.extend_from_slice(&delta.to_le_bytes());
            }
        }
    }

    (bit_width, packed)
}

/// Unpack delta-encoded doc IDs with SIMD acceleration
///
/// Uses SIMD for the 8- and 16-bit widths; the zero and 32-bit widths are
/// decoded with scalar loops.
pub fn unpack_deltas_fixed(
    packed: &[u8],
    bit_width: RoundedBitWidth,
    first_doc_id: DocId,
    count: usize,
    output: &mut [DocId],
) {
    if count == 0 {
        return;
    }

    output[0] = first_doc_id;

    if count == 1 {
        return;
    }

    match bit_width {
        RoundedBitWidth::Zero => {
            // All gaps are 1 (consecutive doc IDs)
            for (i, out) in output.iter_mut().enumerate().skip(1).take(count - 1) {
                *out = first_doc_id + i as u32;
            }
        }
        RoundedBitWidth::Bits8 => {
            simd::unpack_8bit_delta_decode(packed, output, first_doc_id, count);
        }
        RoundedBitWidth::Bits16 => {
            simd::unpack_16bit_delta_decode(packed, output, first_doc_id, count);
        }
        RoundedBitWidth::Bits32 => {
            // Unpack and delta decode
            let mut carry = first_doc_id;
            for i in 0..count - 1 {
                let idx = i * 4;
                let delta = u32::from_le_bytes([
                    packed[idx],
                    packed[idx + 1],
                    packed[idx + 2],
                    packed[idx + 3],
                ]);
                carry = carry.wrapping_add(delta).wrapping_add(1);
                output[i + 1] = carry;
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_vint_roundtrip() {
        let values = [
            0u64,
            1,
            127,
            128,
            255,
            256,
            16383,
            16384,
            u32::MAX as u64,
            u64::MAX,
        ];

        for &value in &values {
            let mut buf = Vec::new();
            write_vint(&mut buf, value).unwrap();
            let read_value = read_vint(&mut buf.as_slice()).unwrap();
            assert_eq!(value, read_value, "Failed for value {}", value);
        }
    }

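    // Pins down the exact continuation-bit byte layout described in
    // `write_vint`'s docs; the expected bytes are derived by hand from the
    // encoding rule (low 7 bits first, high bit set while more bytes follow).
    #[test]
    fn test_vint_byte_layout() {
        let cases: &[(u64, &[u8])] = &[
            (0, &[0x00]),
            (127, &[0x7F]),
            (128, &[0x80, 0x01]),
            (300, &[0xAC, 0x02]),
        ];
        for &(value, expected) in cases {
            let mut buf = Vec::new();
            write_vint(&mut buf, value).unwrap();
            assert_eq!(buf.as_slice(), expected, "unexpected encoding for {}", value);
        }
    }
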
    #[test]
    fn test_skip_list_roundtrip() {
        let mut skip_list = SkipList::new();
        skip_list.push(0, 127, 0);
        skip_list.push(128, 255, 100);
        skip_list.push(256, 500, 200);

        let mut buf = Vec::new();
        skip_list.write(&mut buf).unwrap();

        let restored = SkipList::read(&mut buf.as_slice()).unwrap();
        assert_eq!(skip_list.len(), restored.len());

        for (a, b) in skip_list.iter().zip(restored.iter()) {
            assert_eq!(a, b);
        }
    }

    #[test]
    fn test_skip_list_find_block() {
        let mut skip_list = SkipList::new();
        skip_list.push(0, 99, 0);
        skip_list.push(100, 199, 100);
        skip_list.push(200, 299, 200);

        assert_eq!(skip_list.find_block(0), Some(0));
        assert_eq!(skip_list.find_block(50), Some(0));
        assert_eq!(skip_list.find_block(99), Some(0));
        assert_eq!(skip_list.find_block(100), Some(1));
        assert_eq!(skip_list.find_block(150), Some(1));
        assert_eq!(skip_list.find_block(250), Some(2));
        assert_eq!(skip_list.find_block(300), None);
    }

    #[test]
    fn test_doc_id_block_roundtrip() {
        let doc_ids: Vec<DocId> = vec![0, 5, 10, 100, 1000, 10000];

        let mut buf = Vec::new();
        let last = write_doc_id_block(&mut buf, &doc_ids).unwrap();
        assert_eq!(last, 10000);

        let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
        assert_eq!(doc_ids, restored);
    }

    #[test]
    fn test_doc_id_block_single() {
        let doc_ids: Vec<DocId> = vec![42];

        let mut buf = Vec::new();
        write_doc_id_block(&mut buf, &doc_ids).unwrap();

        let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
        assert_eq!(doc_ids, restored);
    }
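
    // Round-trip check for the fixed-width delta packing. It only exercises
    // the Zero and 32-bit widths, which are decoded by the scalar code in this
    // file; the 8/16-bit widths go through `crate::structures::simd` and are
    // assumed to be covered by that module's own tests.
    #[test]
    fn test_pack_unpack_deltas_fixed_scalar_paths() {
        // Zero width: consecutive doc IDs, nothing needs to be stored.
        let consecutive: Vec<DocId> = vec![10, 11, 12, 13];
        let (width, packed) = pack_deltas_fixed(&consecutive);
        assert_eq!(width, RoundedBitWidth::Zero);
        assert!(packed.is_empty());
        let mut output = vec![0; consecutive.len()];
        unpack_deltas_fixed(&packed, width, consecutive[0], consecutive.len(), &mut output);
        assert_eq!(output, consecutive);

        // 32-bit width: gaps too large for 16 bits.
        let sparse: Vec<DocId> = vec![0, 1_000_000, 3_000_000];
        let (width, packed) = pack_deltas_fixed(&sparse);
        assert_eq!(width, RoundedBitWidth::Bits32);
        let mut output = vec![0; sparse.len()];
        unpack_deltas_fixed(&packed, width, sparse[0], sparse.len(), &mut output);
        assert_eq!(output, sparse);
    }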
}