Skip to main content

hermes_core/structures/postings/
posting_common.rs

1//! Shared primitives for posting lists
2//!
3//! This module contains common code used by both text posting lists and sparse vector posting lists:
4//! - Variable-length integer encoding (varint)
5//! - Skip list structure for block-based access
6//! - Block constants
7
8use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
9use std::io::{self, Read, Write};
10
11use crate::DocId;
12
13/// Standard block size for posting lists (SIMD-friendly)
14pub const BLOCK_SIZE: usize = 128;
15
16/// Write variable-length integer (1-9 bytes)
17///
18/// Uses continuation bit encoding: 7 bits of data per byte,
19/// high bit indicates more bytes follow.
20#[inline]
21pub fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
22    loop {
23        let byte = (value & 0x7F) as u8;
24        value >>= 7;
25        if value == 0 {
26            writer.write_u8(byte)?;
27            return Ok(());
28        } else {
29            writer.write_u8(byte | 0x80)?;
30        }
31    }
32}
33
34/// Read variable-length integer
35#[inline]
36pub fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
37    let mut result = 0u64;
38    let mut shift = 0;
39
40    loop {
41        let byte = reader.read_u8()?;
42        result |= ((byte & 0x7F) as u64) << shift;
43        if byte & 0x80 == 0 {
44            return Ok(result);
45        }
46        shift += 7;
47        if shift >= 64 {
48            return Err(io::Error::new(
49                io::ErrorKind::InvalidData,
50                "varint too long",
51            ));
52        }
53    }
54}
55
56/// Skip list entry for block-based posting lists
57///
58/// Enables O(log n) seeking by storing metadata for each block.
59#[derive(Debug, Clone, Copy, PartialEq, Eq)]
60pub struct SkipEntry {
61    /// First doc_id in the block (absolute)
62    pub first_doc: DocId,
63    /// Last doc_id in the block
64    pub last_doc: DocId,
65    /// Byte offset to block data
66    pub offset: u32,
67}
68
69impl SkipEntry {
70    pub fn new(first_doc: DocId, last_doc: DocId, offset: u32) -> Self {
71        Self {
72            first_doc,
73            last_doc,
74            offset,
75        }
76    }
77
78    /// Write skip entry to writer
79    pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
80        writer.write_u32::<LittleEndian>(self.first_doc)?;
81        writer.write_u32::<LittleEndian>(self.last_doc)?;
82        writer.write_u32::<LittleEndian>(self.offset)?;
83        Ok(())
84    }
85
86    /// Read skip entry from reader
87    pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
88        let first_doc = reader.read_u32::<LittleEndian>()?;
89        let last_doc = reader.read_u32::<LittleEndian>()?;
90        let offset = reader.read_u32::<LittleEndian>()?;
91        Ok(Self {
92            first_doc,
93            last_doc,
94            offset,
95        })
96    }
97}
98
99/// Skip list for block-based posting lists
100#[derive(Debug, Clone, Default)]
101pub struct SkipList {
102    entries: Vec<SkipEntry>,
103}
104
105impl SkipList {
106    pub fn new() -> Self {
107        Self::default()
108    }
109
110    pub fn with_capacity(capacity: usize) -> Self {
111        Self {
112            entries: Vec::with_capacity(capacity),
113        }
114    }
115
116    /// Add a skip entry
117    pub fn push(&mut self, first_doc: DocId, last_doc: DocId, offset: u32) {
118        self.entries
119            .push(SkipEntry::new(first_doc, last_doc, offset));
120    }
121
122    /// Number of blocks
123    pub fn len(&self) -> usize {
124        self.entries.len()
125    }
126
127    pub fn is_empty(&self) -> bool {
128        self.entries.is_empty()
129    }
130
131    /// Get entry by index
132    pub fn get(&self, index: usize) -> Option<&SkipEntry> {
133        self.entries.get(index)
134    }
135
136    /// Find block index containing doc_id >= target
137    ///
138    /// Returns None if target is beyond all blocks.
139    pub fn find_block(&self, target: DocId) -> Option<usize> {
140        self.entries.iter().position(|e| e.last_doc >= target)
141    }
142
143    /// Iterate over entries
144    pub fn iter(&self) -> impl Iterator<Item = &SkipEntry> {
145        self.entries.iter()
146    }
147
148    /// Write skip list to writer
149    pub fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
150        writer.write_u32::<LittleEndian>(self.entries.len() as u32)?;
151        for entry in &self.entries {
152            entry.write(writer)?;
153        }
154        Ok(())
155    }
156
157    /// Read skip list from reader
158    pub fn read<R: Read>(reader: &mut R) -> io::Result<Self> {
159        let count = reader.read_u32::<LittleEndian>()? as usize;
160        let mut entries = Vec::with_capacity(count);
161        for _ in 0..count {
162            entries.push(SkipEntry::read(reader)?);
163        }
164        Ok(Self { entries })
165    }
166
167    /// Convert from tuple format (for compatibility)
168    pub fn from_tuples(tuples: &[(DocId, DocId, u32)]) -> Self {
169        Self {
170            entries: tuples
171                .iter()
172                .map(|(first, last, offset)| SkipEntry::new(*first, *last, *offset))
173                .collect(),
174        }
175    }
176
177    /// Convert to tuple format (for compatibility)
178    pub fn to_tuples(&self) -> Vec<(DocId, DocId, u32)> {
179        self.entries
180            .iter()
181            .map(|e| (e.first_doc, e.last_doc, e.offset))
182            .collect()
183    }
184}
185
186/// Write a block of delta-encoded doc_ids
187///
188/// First doc_id is written as absolute value, rest as deltas.
189/// Returns the last doc_id written.
190pub fn write_doc_id_block<W: Write>(writer: &mut W, doc_ids: &[DocId]) -> io::Result<DocId> {
191    if doc_ids.is_empty() {
192        return Ok(0);
193    }
194
195    write_vint(writer, doc_ids.len() as u64)?;
196
197    let mut prev = 0u32;
198    for (i, &doc_id) in doc_ids.iter().enumerate() {
199        if i == 0 {
200            // First doc_id: absolute
201            write_vint(writer, doc_id as u64)?;
202        } else {
203            // Rest: delta from previous
204            write_vint(writer, (doc_id - prev) as u64)?;
205        }
206        prev = doc_id;
207    }
208
209    Ok(*doc_ids.last().unwrap())
210}
211
212/// Read a block of delta-encoded doc_ids
213///
214/// Returns vector of absolute doc_ids.
215pub fn read_doc_id_block<R: Read>(reader: &mut R) -> io::Result<Vec<DocId>> {
216    let count = read_vint(reader)? as usize;
217    let mut doc_ids = Vec::with_capacity(count);
218
219    let mut prev = 0u32;
220    for i in 0..count {
221        let value = read_vint(reader)? as u32;
222        let doc_id = if i == 0 {
223            value // First: absolute
224        } else {
225            prev + value // Rest: delta
226        };
227        doc_ids.push(doc_id);
228        prev = doc_id;
229    }
230
231    Ok(doc_ids)
232}
233
234// ============================================================================
235// Fixed-width bitpacking for SIMD-friendly delta encoding
236// ============================================================================
237
238use crate::structures::simd;
239
240/// Rounded bit width for SIMD-friendly encoding
241///
242/// Values are rounded up to 0, 8, 16, or 32 bits for efficient SIMD unpacking.
243#[derive(Debug, Clone, Copy, PartialEq, Eq)]
244#[repr(u8)]
245pub enum RoundedBitWidth {
246    /// All values are zero (e.g., consecutive doc IDs)
247    Zero = 0,
248    /// 8-bit values (0-255)
249    Bits8 = 8,
250    /// 16-bit values (0-65535)
251    Bits16 = 16,
252    /// 32-bit values
253    Bits32 = 32,
254}
255
256impl RoundedBitWidth {
257    /// Determine the rounded bit width needed for a maximum value
258    pub fn from_max_value(max_val: u32) -> Self {
259        if max_val == 0 {
260            RoundedBitWidth::Zero
261        } else if max_val <= 255 {
262            RoundedBitWidth::Bits8
263        } else if max_val <= 65535 {
264            RoundedBitWidth::Bits16
265        } else {
266            RoundedBitWidth::Bits32
267        }
268    }
269
270    /// Bytes per value
271    pub fn bytes_per_value(&self) -> usize {
272        match self {
273            RoundedBitWidth::Zero => 0,
274            RoundedBitWidth::Bits8 => 1,
275            RoundedBitWidth::Bits16 => 2,
276            RoundedBitWidth::Bits32 => 4,
277        }
278    }
279
280    /// Convert from u8
281    pub fn from_u8(v: u8) -> Option<Self> {
282        match v {
283            0 => Some(RoundedBitWidth::Zero),
284            8 => Some(RoundedBitWidth::Bits8),
285            16 => Some(RoundedBitWidth::Bits16),
286            32 => Some(RoundedBitWidth::Bits32),
287            _ => None,
288        }
289    }
290}
291
292/// Pack delta-encoded doc IDs with fixed-width encoding
293///
294/// Stores (gap - 1) for each delta to save one bit since gaps are always >= 1.
295/// Returns (bit_width, packed_bytes).
296pub fn pack_deltas_fixed(doc_ids: &[DocId]) -> (RoundedBitWidth, Vec<u8>) {
297    if doc_ids.len() <= 1 {
298        return (RoundedBitWidth::Zero, Vec::new());
299    }
300
301    // Compute deltas and find max
302    let mut max_delta = 0u32;
303    let mut deltas = Vec::with_capacity(doc_ids.len() - 1);
304
305    for i in 1..doc_ids.len() {
306        let delta = doc_ids[i] - doc_ids[i - 1] - 1; // Store gap-1
307        deltas.push(delta);
308        max_delta = max_delta.max(delta);
309    }
310
311    let bit_width = RoundedBitWidth::from_max_value(max_delta);
312    let bytes_per_val = bit_width.bytes_per_value();
313
314    if bytes_per_val == 0 {
315        return (bit_width, Vec::new());
316    }
317
318    let mut packed = Vec::with_capacity(deltas.len() * bytes_per_val);
319
320    match bit_width {
321        RoundedBitWidth::Zero => {}
322        RoundedBitWidth::Bits8 => {
323            for delta in deltas {
324                packed.push(delta as u8);
325            }
326        }
327        RoundedBitWidth::Bits16 => {
328            for delta in deltas {
329                packed.extend_from_slice(&(delta as u16).to_le_bytes());
330            }
331        }
332        RoundedBitWidth::Bits32 => {
333            for delta in deltas {
334                packed.extend_from_slice(&delta.to_le_bytes());
335            }
336        }
337    }
338
339    (bit_width, packed)
340}
341
342/// Unpack delta-encoded doc IDs with SIMD acceleration
343///
344/// Uses SIMD for 8/16/32-bit widths, scalar for zero width.
345pub fn unpack_deltas_fixed(
346    packed: &[u8],
347    bit_width: RoundedBitWidth,
348    first_doc_id: DocId,
349    count: usize,
350    output: &mut [DocId],
351) {
352    if count == 0 {
353        return;
354    }
355
356    output[0] = first_doc_id;
357
358    if count == 1 {
359        return;
360    }
361
362    match bit_width {
363        RoundedBitWidth::Zero => {
364            // All gaps are 1 (consecutive doc IDs)
365            for (i, out) in output.iter_mut().enumerate().skip(1).take(count - 1) {
366                *out = first_doc_id + i as u32;
367            }
368        }
369        RoundedBitWidth::Bits8 => {
370            simd::unpack_8bit_delta_decode(packed, output, first_doc_id, count);
371        }
372        RoundedBitWidth::Bits16 => {
373            simd::unpack_16bit_delta_decode(packed, output, first_doc_id, count);
374        }
375        RoundedBitWidth::Bits32 => {
376            // Unpack and delta decode
377            let mut carry = first_doc_id;
378            for i in 0..count - 1 {
379                let idx = i * 4;
380                let delta = u32::from_le_bytes([
381                    packed[idx],
382                    packed[idx + 1],
383                    packed[idx + 2],
384                    packed[idx + 3],
385                ]);
386                carry = carry.wrapping_add(delta).wrapping_add(1);
387                output[i + 1] = carry;
388            }
389        }
390    }
391}
392
393#[cfg(test)]
394mod tests {
395    use super::*;
396
397    #[test]
398    fn test_vint_roundtrip() {
399        let values = [
400            0u64,
401            1,
402            127,
403            128,
404            255,
405            256,
406            16383,
407            16384,
408            u32::MAX as u64,
409            u64::MAX,
410        ];
411
412        for &value in &values {
413            let mut buf = Vec::new();
414            write_vint(&mut buf, value).unwrap();
415            let read_value = read_vint(&mut buf.as_slice()).unwrap();
416            assert_eq!(value, read_value, "Failed for value {}", value);
417        }
418    }
419
420    #[test]
421    fn test_skip_list_roundtrip() {
422        let mut skip_list = SkipList::new();
423        skip_list.push(0, 127, 0);
424        skip_list.push(128, 255, 100);
425        skip_list.push(256, 500, 200);
426
427        let mut buf = Vec::new();
428        skip_list.write(&mut buf).unwrap();
429
430        let restored = SkipList::read(&mut buf.as_slice()).unwrap();
431        assert_eq!(skip_list.len(), restored.len());
432
433        for (a, b) in skip_list.iter().zip(restored.iter()) {
434            assert_eq!(a, b);
435        }
436    }
437
438    #[test]
439    fn test_skip_list_find_block() {
440        let mut skip_list = SkipList::new();
441        skip_list.push(0, 99, 0);
442        skip_list.push(100, 199, 100);
443        skip_list.push(200, 299, 200);
444
445        assert_eq!(skip_list.find_block(0), Some(0));
446        assert_eq!(skip_list.find_block(50), Some(0));
447        assert_eq!(skip_list.find_block(99), Some(0));
448        assert_eq!(skip_list.find_block(100), Some(1));
449        assert_eq!(skip_list.find_block(150), Some(1));
450        assert_eq!(skip_list.find_block(250), Some(2));
451        assert_eq!(skip_list.find_block(300), None);
452    }
453
454    #[test]
455    fn test_doc_id_block_roundtrip() {
456        let doc_ids: Vec<DocId> = vec![0, 5, 10, 100, 1000, 10000];
457
458        let mut buf = Vec::new();
459        let last = write_doc_id_block(&mut buf, &doc_ids).unwrap();
460        assert_eq!(last, 10000);
461
462        let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
463        assert_eq!(doc_ids, restored);
464    }
465
466    #[test]
467    fn test_doc_id_block_single() {
468        let doc_ids: Vec<DocId> = vec![42];
469
470        let mut buf = Vec::new();
471        write_doc_id_block(&mut buf, &doc_ids).unwrap();
472
473        let restored = read_doc_id_block(&mut buf.as_slice()).unwrap();
474        assert_eq!(doc_ids, restored);
475    }
476}