Skip to main content

hermes_core/structures/postings/
posting.rs

1//! Posting list implementation with compact representation
2//!
3//! Uses delta encoding and variable-length integers for compact storage.
4
5use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
6use std::io::{self, Read, Write};
7
8use crate::DocId;
9
10/// A posting entry containing doc_id and term frequency
11#[derive(Debug, Clone, Copy, PartialEq, Eq)]
12pub struct Posting {
13    pub doc_id: DocId,
14    pub term_freq: u32,
15}
16
17/// Compact posting list with delta encoding
18#[derive(Debug, Clone, Default)]
19pub struct PostingList {
20    postings: Vec<Posting>,
21}
22
23impl PostingList {
24    pub fn new() -> Self {
25        Self::default()
26    }
27
28    pub fn with_capacity(capacity: usize) -> Self {
29        Self {
30            postings: Vec::with_capacity(capacity),
31        }
32    }
33
34    /// Add a posting (must be added in doc_id order)
35    pub fn push(&mut self, doc_id: DocId, term_freq: u32) {
36        debug_assert!(
37            self.postings.is_empty() || self.postings.last().unwrap().doc_id < doc_id,
38            "Postings must be added in sorted order"
39        );
40        self.postings.push(Posting { doc_id, term_freq });
41    }
42
43    /// Add a posting, incrementing term_freq if doc already exists
44    pub fn add(&mut self, doc_id: DocId, term_freq: u32) {
45        if let Some(last) = self.postings.last_mut()
46            && last.doc_id == doc_id
47        {
48            last.term_freq += term_freq;
49            return;
50        }
51        self.postings.push(Posting { doc_id, term_freq });
52    }
53
54    /// Get document count
55    pub fn doc_count(&self) -> u32 {
56        self.postings.len() as u32
57    }
58
59    pub fn len(&self) -> usize {
60        self.postings.len()
61    }
62
63    pub fn is_empty(&self) -> bool {
64        self.postings.is_empty()
65    }
66
67    pub fn iter(&self) -> impl Iterator<Item = &Posting> {
68        self.postings.iter()
69    }
70
71    /// Serialize to bytes using delta encoding and varint
72    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
73        // Write number of postings
74        write_vint(writer, self.postings.len() as u64)?;
75
76        let mut prev_doc_id = 0u32;
77        for posting in &self.postings {
78            // Delta encode doc_id
79            let delta = posting.doc_id - prev_doc_id;
80            write_vint(writer, delta as u64)?;
81            write_vint(writer, posting.term_freq as u64)?;
82            prev_doc_id = posting.doc_id;
83        }
84
85        Ok(())
86    }
87
88    /// Deserialize from bytes
89    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
90        let count = read_vint(reader)? as usize;
91        let mut postings = Vec::with_capacity(count);
92
93        let mut prev_doc_id = 0u32;
94        for _ in 0..count {
95            let delta = read_vint(reader)? as u32;
96            let term_freq = read_vint(reader)? as u32;
97            let doc_id = prev_doc_id + delta;
98            postings.push(Posting { doc_id, term_freq });
99            prev_doc_id = doc_id;
100        }
101
102        Ok(Self { postings })
103    }
104}
105
106/// Iterator over posting list that supports seeking
107pub struct PostingListIterator<'a> {
108    postings: &'a [Posting],
109    position: usize,
110}
111
112impl<'a> PostingListIterator<'a> {
113    pub fn new(posting_list: &'a PostingList) -> Self {
114        Self {
115            postings: &posting_list.postings,
116            position: 0,
117        }
118    }
119
120    /// Current document ID, or TERMINATED if exhausted
121    pub fn doc(&self) -> DocId {
122        if self.position < self.postings.len() {
123            self.postings[self.position].doc_id
124        } else {
125            TERMINATED
126        }
127    }
128
129    /// Current term frequency
130    pub fn term_freq(&self) -> u32 {
131        if self.position < self.postings.len() {
132            self.postings[self.position].term_freq
133        } else {
134            0
135        }
136    }
137
138    /// Advance to next posting, returns new doc_id or TERMINATED
139    pub fn advance(&mut self) -> DocId {
140        self.position += 1;
141        self.doc()
142    }
143
144    /// Seek to first doc_id >= target
145    pub fn seek(&mut self, target: DocId) -> DocId {
146        // Binary search for efficiency
147        while self.position < self.postings.len() {
148            if self.postings[self.position].doc_id >= target {
149                return self.postings[self.position].doc_id;
150            }
151            self.position += 1;
152        }
153        TERMINATED
154    }
155
156    /// Size hint for remaining elements
157    pub fn size_hint(&self) -> usize {
158        self.postings.len().saturating_sub(self.position)
159    }
160}
161
162/// Sentinel value indicating iterator is exhausted
163pub const TERMINATED: DocId = DocId::MAX;
164
165/// Write variable-length integer (1-9 bytes)
166fn write_vint<W: Write>(writer: &mut W, mut value: u64) -> io::Result<()> {
167    loop {
168        let byte = (value & 0x7F) as u8;
169        value >>= 7;
170        if value == 0 {
171            writer.write_u8(byte)?;
172            return Ok(());
173        } else {
174            writer.write_u8(byte | 0x80)?;
175        }
176    }
177}
178
179/// Read variable-length integer
180fn read_vint<R: Read>(reader: &mut R) -> io::Result<u64> {
181    let mut result = 0u64;
182    let mut shift = 0;
183
184    loop {
185        let byte = reader.read_u8()?;
186        result |= ((byte & 0x7F) as u64) << shift;
187        if byte & 0x80 == 0 {
188            return Ok(result);
189        }
190        shift += 7;
191        if shift >= 64 {
192            return Err(io::Error::new(
193                io::ErrorKind::InvalidData,
194                "varint too long",
195            ));
196        }
197    }
198}
199
200/// Compact posting list stored as raw bytes (for memory-mapped access)
201#[allow(dead_code)]
202#[derive(Debug, Clone)]
203pub struct CompactPostingList {
204    data: Vec<u8>,
205    doc_count: u32,
206}
207
208#[allow(dead_code)]
209impl CompactPostingList {
210    /// Create from a posting list
211    pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
212        let mut data = Vec::new();
213        list.serialize(&mut data)?;
214        Ok(Self {
215            doc_count: list.len() as u32,
216            data,
217        })
218    }
219
220    /// Get the raw bytes
221    pub fn as_bytes(&self) -> &[u8] {
222        &self.data
223    }
224
225    /// Number of documents in the posting list
226    pub fn doc_count(&self) -> u32 {
227        self.doc_count
228    }
229
230    /// Deserialize back to PostingList
231    pub fn to_posting_list(&self) -> io::Result<PostingList> {
232        PostingList::deserialize(&mut &self.data[..])
233    }
234}
235
236/// Block-based posting list for skip-list style access
237/// Each block contains up to BLOCK_SIZE postings
238pub const BLOCK_SIZE: usize = 128;
239
240#[derive(Debug, Clone)]
241pub struct BlockPostingList {
242    /// Skip list: (base_doc_id, last_doc_id_in_block, byte_offset, block_max_tf)
243    /// base_doc_id is the first doc_id in the block (absolute, not delta)
244    /// block_max_tf enables Block-Max WAND optimization
245    skip_list: Vec<(DocId, DocId, u32, u32)>,
246    /// Compressed posting data
247    data: Vec<u8>,
248    /// Total number of postings
249    doc_count: u32,
250    /// Maximum term frequency across all postings (for WAND upper bound)
251    max_tf: u32,
252}
253
254impl BlockPostingList {
255    /// Build from a posting list
256    pub fn from_posting_list(list: &PostingList) -> io::Result<Self> {
257        let mut skip_list = Vec::new();
258        let mut data = Vec::new();
259        let mut max_tf = 0u32;
260
261        let postings = &list.postings;
262        let mut i = 0;
263
264        while i < postings.len() {
265            let block_start = data.len() as u32;
266            let block_end = (i + BLOCK_SIZE).min(postings.len());
267            let block = &postings[i..block_end];
268
269            // Compute block's max term frequency for Block-Max WAND
270            let block_max_tf = block.iter().map(|p| p.term_freq).max().unwrap_or(0);
271            max_tf = max_tf.max(block_max_tf);
272
273            // Record skip entry with base_doc_id (first doc in block)
274            let base_doc_id = block.first().unwrap().doc_id;
275            let last_doc_id = block.last().unwrap().doc_id;
276            skip_list.push((base_doc_id, last_doc_id, block_start, block_max_tf));
277
278            // Write block - first doc is stored as absolute, rest as deltas
279            write_vint(&mut data, block.len() as u64)?;
280
281            let mut prev_doc_id = base_doc_id;
282            for (j, posting) in block.iter().enumerate() {
283                if j == 0 {
284                    // First doc in block: store absolute (will be adjusted during merge)
285                    write_vint(&mut data, posting.doc_id as u64)?;
286                } else {
287                    // Rest: store delta from previous
288                    let delta = posting.doc_id - prev_doc_id;
289                    write_vint(&mut data, delta as u64)?;
290                }
291                write_vint(&mut data, posting.term_freq as u64)?;
292                prev_doc_id = posting.doc_id;
293            }
294
295            i = block_end;
296        }
297
298        Ok(Self {
299            skip_list,
300            data,
301            doc_count: postings.len() as u32,
302            max_tf,
303        })
304    }
305
306    /// Serialize the block posting list
307    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
308        // Write doc count and max_tf
309        writer.write_u32::<LittleEndian>(self.doc_count)?;
310        writer.write_u32::<LittleEndian>(self.max_tf)?;
311
312        // Write skip list (base_doc_id, last_doc_id, offset, block_max_tf)
313        writer.write_u32::<LittleEndian>(self.skip_list.len() as u32)?;
314        for (base_doc_id, last_doc_id, offset, block_max_tf) in &self.skip_list {
315            writer.write_u32::<LittleEndian>(*base_doc_id)?;
316            writer.write_u32::<LittleEndian>(*last_doc_id)?;
317            writer.write_u32::<LittleEndian>(*offset)?;
318            writer.write_u32::<LittleEndian>(*block_max_tf)?;
319        }
320
321        // Write data
322        writer.write_u32::<LittleEndian>(self.data.len() as u32)?;
323        writer.write_all(&self.data)?;
324
325        Ok(())
326    }
327
328    /// Deserialize
329    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
330        let doc_count = reader.read_u32::<LittleEndian>()?;
331        let max_tf = reader.read_u32::<LittleEndian>()?;
332
333        let skip_count = reader.read_u32::<LittleEndian>()? as usize;
334        let mut skip_list = Vec::with_capacity(skip_count);
335        for _ in 0..skip_count {
336            let base_doc_id = reader.read_u32::<LittleEndian>()?;
337            let last_doc_id = reader.read_u32::<LittleEndian>()?;
338            let offset = reader.read_u32::<LittleEndian>()?;
339            let block_max_tf = reader.read_u32::<LittleEndian>()?;
340            skip_list.push((base_doc_id, last_doc_id, offset, block_max_tf));
341        }
342
343        let data_len = reader.read_u32::<LittleEndian>()? as usize;
344        let mut data = vec![0u8; data_len];
345        reader.read_exact(&mut data)?;
346
347        Ok(Self {
348            skip_list,
349            data,
350            max_tf,
351            doc_count,
352        })
353    }
354
355    pub fn doc_count(&self) -> u32 {
356        self.doc_count
357    }
358
359    /// Get maximum term frequency (for WAND upper bound computation)
360    pub fn max_tf(&self) -> u32 {
361        self.max_tf
362    }
363
364    /// Get number of blocks
365    pub fn num_blocks(&self) -> usize {
366        self.skip_list.len()
367    }
368
369    /// Get block metadata: (base_doc_id, last_doc_id, data_offset, data_len, block_max_tf)
370    pub fn block_info(&self, block_idx: usize) -> Option<(DocId, DocId, usize, usize, u32)> {
371        if block_idx >= self.skip_list.len() {
372            return None;
373        }
374        let (base, last, offset, block_max_tf) = self.skip_list[block_idx];
375        let next_offset = if block_idx + 1 < self.skip_list.len() {
376            self.skip_list[block_idx + 1].2 as usize
377        } else {
378            self.data.len()
379        };
380        Some((
381            base,
382            last,
383            offset as usize,
384            next_offset - offset as usize,
385            block_max_tf,
386        ))
387    }
388
389    /// Get block's max term frequency for Block-Max WAND
390    pub fn block_max_tf(&self, block_idx: usize) -> Option<u32> {
391        self.skip_list
392            .get(block_idx)
393            .map(|(_, _, _, max_tf)| *max_tf)
394    }
395
396    /// Get raw block data for direct copying during merge
397    pub fn block_data(&self, block_idx: usize) -> Option<&[u8]> {
398        let (_, _, offset, len, _) = self.block_info(block_idx)?;
399        Some(&self.data[offset..offset + len])
400    }
401
402    /// Concatenate blocks from multiple posting lists with doc_id remapping
403    /// This is O(num_blocks) instead of O(num_postings)
404    pub fn concatenate_blocks(sources: &[(BlockPostingList, u32)]) -> io::Result<Self> {
405        let mut skip_list = Vec::new();
406        let mut data = Vec::new();
407        let mut total_docs = 0u32;
408        let mut max_tf = 0u32;
409
410        for (source, doc_offset) in sources {
411            max_tf = max_tf.max(source.max_tf);
412            for block_idx in 0..source.num_blocks() {
413                if let Some((base, last, src_offset, len, block_max_tf)) =
414                    source.block_info(block_idx)
415                {
416                    let new_base = base + doc_offset;
417                    let new_last = last + doc_offset;
418                    let new_offset = data.len() as u32;
419
420                    // Copy block data, but we need to adjust the first doc_id in the block
421                    let block_bytes = &source.data[src_offset..src_offset + len];
422                    let mut reader = block_bytes;
423
424                    // Read count
425                    let count = read_vint(&mut reader)? as usize;
426
427                    // Write count
428                    write_vint(&mut data, count as u64)?;
429
430                    // Read and adjust first doc_id
431                    let first_doc = read_vint(&mut reader)? as u32;
432                    let first_tf = read_vint(&mut reader)?;
433                    write_vint(&mut data, (first_doc + doc_offset) as u64)?;
434                    write_vint(&mut data, first_tf)?;
435
436                    // Copy remaining postings unchanged (they're deltas)
437                    data.extend_from_slice(reader);
438
439                    skip_list.push((new_base, new_last, new_offset, block_max_tf));
440                    total_docs += count as u32;
441                }
442            }
443        }
444
445        Ok(Self {
446            skip_list,
447            data,
448            doc_count: total_docs,
449            max_tf,
450        })
451    }
452
453    /// Create an iterator with skip support
454    pub fn iterator(&self) -> BlockPostingIterator<'_> {
455        BlockPostingIterator::new(self)
456    }
457
458    /// Create an owned iterator that doesn't borrow self
459    pub fn into_iterator(self) -> BlockPostingIterator<'static> {
460        BlockPostingIterator::owned(self)
461    }
462}
463
464/// Iterator over block posting list with skip support
465/// Can be either borrowed or owned via Cow
466pub struct BlockPostingIterator<'a> {
467    block_list: std::borrow::Cow<'a, BlockPostingList>,
468    current_block: usize,
469    block_postings: Vec<Posting>,
470    position_in_block: usize,
471    exhausted: bool,
472}
473
474/// Type alias for owned iterator
475#[allow(dead_code)]
476pub type OwnedBlockPostingIterator = BlockPostingIterator<'static>;
477
478impl<'a> BlockPostingIterator<'a> {
479    fn new(block_list: &'a BlockPostingList) -> Self {
480        let exhausted = block_list.skip_list.is_empty();
481        let mut iter = Self {
482            block_list: std::borrow::Cow::Borrowed(block_list),
483            current_block: 0,
484            block_postings: Vec::new(),
485            position_in_block: 0,
486            exhausted,
487        };
488        if !iter.exhausted {
489            iter.load_block(0);
490        }
491        iter
492    }
493
494    fn owned(block_list: BlockPostingList) -> BlockPostingIterator<'static> {
495        let exhausted = block_list.skip_list.is_empty();
496        let mut iter = BlockPostingIterator {
497            block_list: std::borrow::Cow::Owned(block_list),
498            current_block: 0,
499            block_postings: Vec::new(),
500            position_in_block: 0,
501            exhausted,
502        };
503        if !iter.exhausted {
504            iter.load_block(0);
505        }
506        iter
507    }
508
509    fn load_block(&mut self, block_idx: usize) {
510        if block_idx >= self.block_list.skip_list.len() {
511            self.exhausted = true;
512            return;
513        }
514
515        self.current_block = block_idx;
516        self.position_in_block = 0;
517
518        let offset = self.block_list.skip_list[block_idx].2 as usize;
519        let mut reader = &self.block_list.data[offset..];
520
521        let count = read_vint(&mut reader).unwrap_or(0) as usize;
522        self.block_postings.clear();
523        self.block_postings.reserve(count);
524
525        let mut prev_doc_id = 0u32;
526
527        for i in 0..count {
528            if let (Ok(value), Ok(tf)) = (read_vint(&mut reader), read_vint(&mut reader)) {
529                let doc_id = if i == 0 {
530                    // First doc in block: absolute value
531                    value as u32
532                } else {
533                    // Rest: delta from previous
534                    prev_doc_id + value as u32
535                };
536                self.block_postings.push(Posting {
537                    doc_id,
538                    term_freq: tf as u32,
539                });
540                prev_doc_id = doc_id;
541            }
542        }
543    }
544
545    pub fn doc(&self) -> DocId {
546        if self.exhausted {
547            TERMINATED
548        } else if self.position_in_block < self.block_postings.len() {
549            self.block_postings[self.position_in_block].doc_id
550        } else {
551            TERMINATED
552        }
553    }
554
555    pub fn term_freq(&self) -> u32 {
556        if self.exhausted || self.position_in_block >= self.block_postings.len() {
557            0
558        } else {
559            self.block_postings[self.position_in_block].term_freq
560        }
561    }
562
563    pub fn advance(&mut self) -> DocId {
564        if self.exhausted {
565            return TERMINATED;
566        }
567
568        self.position_in_block += 1;
569        if self.position_in_block >= self.block_postings.len() {
570            self.load_block(self.current_block + 1);
571        }
572        self.doc()
573    }
574
575    pub fn seek(&mut self, target: DocId) -> DocId {
576        if self.exhausted {
577            return TERMINATED;
578        }
579
580        let target_block = self
581            .block_list
582            .skip_list
583            .iter()
584            .position(|(_, last_doc, _, _)| *last_doc >= target);
585
586        if let Some(block_idx) = target_block {
587            if block_idx != self.current_block {
588                self.load_block(block_idx);
589            }
590
591            while self.position_in_block < self.block_postings.len() {
592                if self.block_postings[self.position_in_block].doc_id >= target {
593                    return self.doc();
594                }
595                self.position_in_block += 1;
596            }
597
598            self.load_block(self.current_block + 1);
599            self.seek(target)
600        } else {
601            self.exhausted = true;
602            TERMINATED
603        }
604    }
605
606    /// Skip to the next block, returning the first doc_id in the new block
607    /// This is used for block-max WAND optimization when the current block's
608    /// max score can't beat the threshold.
609    pub fn skip_to_next_block(&mut self) -> DocId {
610        if self.exhausted {
611            return TERMINATED;
612        }
613        self.load_block(self.current_block + 1);
614        self.doc()
615    }
616
617    /// Get the current block index
618    #[inline]
619    pub fn current_block_idx(&self) -> usize {
620        self.current_block
621    }
622
623    /// Get total number of blocks
624    #[inline]
625    pub fn num_blocks(&self) -> usize {
626        self.block_list.skip_list.len()
627    }
628
629    /// Get the current block's max term frequency for Block-Max WAND
630    #[inline]
631    pub fn current_block_max_tf(&self) -> u32 {
632        if self.exhausted || self.current_block >= self.block_list.skip_list.len() {
633            0
634        } else {
635            self.block_list.skip_list[self.current_block].3
636        }
637    }
638}
639
640#[cfg(test)]
641mod tests {
642    use super::*;
643
644    #[test]
645    fn test_posting_list_basic() {
646        let mut list = PostingList::new();
647        list.push(1, 2);
648        list.push(5, 1);
649        list.push(10, 3);
650
651        assert_eq!(list.len(), 3);
652
653        let mut iter = PostingListIterator::new(&list);
654        assert_eq!(iter.doc(), 1);
655        assert_eq!(iter.term_freq(), 2);
656
657        assert_eq!(iter.advance(), 5);
658        assert_eq!(iter.term_freq(), 1);
659
660        assert_eq!(iter.advance(), 10);
661        assert_eq!(iter.term_freq(), 3);
662
663        assert_eq!(iter.advance(), TERMINATED);
664    }
665
666    #[test]
667    fn test_posting_list_serialization() {
668        let mut list = PostingList::new();
669        for i in 0..100 {
670            list.push(i * 3, (i % 5) + 1);
671        }
672
673        let mut buffer = Vec::new();
674        list.serialize(&mut buffer).unwrap();
675
676        let deserialized = PostingList::deserialize(&mut &buffer[..]).unwrap();
677        assert_eq!(deserialized.len(), list.len());
678
679        for (a, b) in list.iter().zip(deserialized.iter()) {
680            assert_eq!(a, b);
681        }
682    }
683
684    #[test]
685    fn test_posting_list_seek() {
686        let mut list = PostingList::new();
687        for i in 0..100 {
688            list.push(i * 2, 1);
689        }
690
691        let mut iter = PostingListIterator::new(&list);
692
693        assert_eq!(iter.seek(50), 50);
694        assert_eq!(iter.seek(51), 52);
695        assert_eq!(iter.seek(200), TERMINATED);
696    }
697
698    #[test]
699    fn test_block_posting_list() {
700        let mut list = PostingList::new();
701        for i in 0..500 {
702            list.push(i * 2, (i % 10) + 1);
703        }
704
705        let block_list = BlockPostingList::from_posting_list(&list).unwrap();
706        assert_eq!(block_list.doc_count(), 500);
707
708        let mut iter = block_list.iterator();
709        assert_eq!(iter.doc(), 0);
710        assert_eq!(iter.term_freq(), 1);
711
712        // Test seek across blocks
713        assert_eq!(iter.seek(500), 500);
714        assert_eq!(iter.seek(998), 998);
715        assert_eq!(iter.seek(1000), TERMINATED);
716    }
717
718    #[test]
719    fn test_block_posting_list_serialization() {
720        let mut list = PostingList::new();
721        for i in 0..300 {
722            list.push(i * 3, i + 1);
723        }
724
725        let block_list = BlockPostingList::from_posting_list(&list).unwrap();
726
727        let mut buffer = Vec::new();
728        block_list.serialize(&mut buffer).unwrap();
729
730        let deserialized = BlockPostingList::deserialize(&mut &buffer[..]).unwrap();
731        assert_eq!(deserialized.doc_count(), block_list.doc_count());
732
733        // Verify iteration produces same results
734        let mut iter1 = block_list.iterator();
735        let mut iter2 = deserialized.iterator();
736
737        while iter1.doc() != TERMINATED {
738            assert_eq!(iter1.doc(), iter2.doc());
739            assert_eq!(iter1.term_freq(), iter2.term_freq());
740            iter1.advance();
741            iter2.advance();
742        }
743        assert_eq!(iter2.doc(), TERMINATED);
744    }
745}