hermes_core/structures/
posting_format.rs

1//! Unified posting list format with automatic compression selection
2//!
3//! Automatically selects the best compression method based on posting list characteristics:
4//! - **Inline**: 1-3 postings stored directly in TermInfo (no separate I/O)
5//! - **Bitpacked**: Standard block-based compression for medium lists
6//! - **Elias-Fano**: Near-optimal compression for long lists (>10K docs)
7//! - **Roaring**: Bitmap-based for very frequent terms (>1% of corpus)
8
9use byteorder::{ReadBytesExt, WriteBytesExt};
10use std::io::{self, Read, Write};
11
12use super::bitpacking::BitpackedPostingList;
13use super::elias_fano::EliasFanoPostingList;
14use super::roaring::RoaringPostingList;
15
16/// Thresholds for format selection
17pub const INLINE_THRESHOLD: usize = 3;
18pub const ELIAS_FANO_THRESHOLD: usize = 10_000;
19pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; // 1% of corpus
20
21/// Format tag bytes for serialization
22const FORMAT_BITPACKED: u8 = 0;
23const FORMAT_ELIAS_FANO: u8 = 1;
24const FORMAT_ROARING: u8 = 2;
25
26/// Posting list format selector
27#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub enum PostingFormat {
29    /// Standard bitpacked format (default)
30    Bitpacked,
31    /// Elias-Fano for long posting lists
32    EliasFano,
33    /// Roaring bitmap for very frequent terms
34    Roaring,
35}
36
37impl PostingFormat {
38    /// Select optimal format based on posting list characteristics
39    pub fn select(doc_count: usize, total_docs: usize) -> Self {
40        let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
41
42        if frequency_ratio >= ROARING_THRESHOLD_RATIO && doc_count > ELIAS_FANO_THRESHOLD {
43            // Very frequent term - use Roaring for fast intersections
44            PostingFormat::Roaring
45        } else if doc_count >= ELIAS_FANO_THRESHOLD {
46            // Long posting list - use Elias-Fano for better compression
47            PostingFormat::EliasFano
48        } else {
49            // Standard case - use bitpacked
50            PostingFormat::Bitpacked
51        }
52    }
53}
54
55/// Unified posting list that can use any compression format
56#[derive(Debug, Clone)]
57pub enum CompressedPostingList {
58    Bitpacked(BitpackedPostingList),
59    EliasFano(EliasFanoPostingList),
60    Roaring(RoaringPostingList),
61}
62
63impl CompressedPostingList {
64    /// Create from raw postings with automatic format selection
65    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
66        let format = PostingFormat::select(doc_ids.len(), total_docs);
67
68        match format {
69            PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
70                BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
71            ),
72            PostingFormat::EliasFano => CompressedPostingList::EliasFano(
73                EliasFanoPostingList::from_postings(doc_ids, term_freqs),
74            ),
75            PostingFormat::Roaring => CompressedPostingList::Roaring(
76                RoaringPostingList::from_postings(doc_ids, term_freqs),
77            ),
78        }
79    }
80
81    /// Create with explicit format selection
82    pub fn from_postings_with_format(
83        doc_ids: &[u32],
84        term_freqs: &[u32],
85        format: PostingFormat,
86        idf: f32,
87    ) -> Self {
88        match format {
89            PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
90                BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
91            ),
92            PostingFormat::EliasFano => CompressedPostingList::EliasFano(
93                EliasFanoPostingList::from_postings(doc_ids, term_freqs),
94            ),
95            PostingFormat::Roaring => CompressedPostingList::Roaring(
96                RoaringPostingList::from_postings(doc_ids, term_freqs),
97            ),
98        }
99    }
100
101    /// Get document count
102    pub fn doc_count(&self) -> u32 {
103        match self {
104            CompressedPostingList::Bitpacked(p) => p.doc_count,
105            CompressedPostingList::EliasFano(p) => p.len(),
106            CompressedPostingList::Roaring(p) => p.len(),
107        }
108    }
109
110    /// Get maximum term frequency
111    pub fn max_tf(&self) -> u32 {
112        match self {
113            CompressedPostingList::Bitpacked(p) => p.max_score as u32, // Approximation
114            CompressedPostingList::EliasFano(p) => p.max_tf,
115            CompressedPostingList::Roaring(p) => p.max_tf,
116        }
117    }
118
119    /// Get format type
120    pub fn format(&self) -> PostingFormat {
121        match self {
122            CompressedPostingList::Bitpacked(_) => PostingFormat::Bitpacked,
123            CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
124            CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
125        }
126    }
127
128    /// Serialize
129    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
130        match self {
131            CompressedPostingList::Bitpacked(p) => {
132                writer.write_u8(FORMAT_BITPACKED)?;
133                p.serialize(writer)
134            }
135            CompressedPostingList::EliasFano(p) => {
136                writer.write_u8(FORMAT_ELIAS_FANO)?;
137                p.serialize(writer)
138            }
139            CompressedPostingList::Roaring(p) => {
140                writer.write_u8(FORMAT_ROARING)?;
141                p.serialize(writer)
142            }
143        }
144    }
145
146    /// Deserialize
147    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
148        let format = reader.read_u8()?;
149        match format {
150            FORMAT_BITPACKED => Ok(CompressedPostingList::Bitpacked(
151                BitpackedPostingList::deserialize(reader)?,
152            )),
153            FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
154                EliasFanoPostingList::deserialize(reader)?,
155            )),
156            FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
157                RoaringPostingList::deserialize(reader)?,
158            )),
159            _ => Err(io::Error::new(
160                io::ErrorKind::InvalidData,
161                format!("Unknown posting list format: {}", format),
162            )),
163        }
164    }
165
166    /// Create an iterator
167    pub fn iterator(&self) -> CompressedPostingIterator<'_> {
168        match self {
169            CompressedPostingList::Bitpacked(p) => {
170                CompressedPostingIterator::Bitpacked(p.iterator())
171            }
172            CompressedPostingList::EliasFano(p) => {
173                CompressedPostingIterator::EliasFano(p.iterator())
174            }
175            CompressedPostingList::Roaring(p) => {
176                let mut iter = p.iterator();
177                iter.init();
178                CompressedPostingIterator::Roaring(iter)
179            }
180        }
181    }
182}
183
184/// Unified iterator over any posting list format
185pub enum CompressedPostingIterator<'a> {
186    Bitpacked(super::bitpacking::BitpackedPostingIterator<'a>),
187    EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
188    Roaring(super::roaring::RoaringPostingIterator<'a>),
189}
190
191impl<'a> CompressedPostingIterator<'a> {
192    /// Current document ID
193    pub fn doc(&self) -> u32 {
194        match self {
195            CompressedPostingIterator::Bitpacked(i) => i.doc(),
196            CompressedPostingIterator::EliasFano(i) => i.doc(),
197            CompressedPostingIterator::Roaring(i) => i.doc(),
198        }
199    }
200
201    /// Current term frequency
202    pub fn term_freq(&self) -> u32 {
203        match self {
204            CompressedPostingIterator::Bitpacked(i) => i.term_freq(),
205            CompressedPostingIterator::EliasFano(i) => i.term_freq(),
206            CompressedPostingIterator::Roaring(i) => i.term_freq(),
207        }
208    }
209
210    /// Advance to next document
211    pub fn advance(&mut self) -> u32 {
212        match self {
213            CompressedPostingIterator::Bitpacked(i) => i.advance(),
214            CompressedPostingIterator::EliasFano(i) => i.advance(),
215            CompressedPostingIterator::Roaring(i) => i.advance(),
216        }
217    }
218
219    /// Seek to first doc >= target
220    pub fn seek(&mut self, target: u32) -> u32 {
221        match self {
222            CompressedPostingIterator::Bitpacked(i) => i.seek(target),
223            CompressedPostingIterator::EliasFano(i) => i.seek(target),
224            CompressedPostingIterator::Roaring(i) => i.seek(target),
225        }
226    }
227
228    /// Check if exhausted
229    pub fn is_exhausted(&self) -> bool {
230        self.doc() == u32::MAX
231    }
232}
233
234/// Statistics about compression format usage
235#[derive(Debug, Default, Clone)]
236pub struct CompressionStats {
237    pub bitpacked_count: u32,
238    pub bitpacked_docs: u64,
239    pub elias_fano_count: u32,
240    pub elias_fano_docs: u64,
241    pub roaring_count: u32,
242    pub roaring_docs: u64,
243    pub inline_count: u32,
244    pub inline_docs: u64,
245}
246
247impl CompressionStats {
248    pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
249        match format {
250            PostingFormat::Bitpacked => {
251                self.bitpacked_count += 1;
252                self.bitpacked_docs += doc_count as u64;
253            }
254            PostingFormat::EliasFano => {
255                self.elias_fano_count += 1;
256                self.elias_fano_docs += doc_count as u64;
257            }
258            PostingFormat::Roaring => {
259                self.roaring_count += 1;
260                self.roaring_docs += doc_count as u64;
261            }
262        }
263    }
264
265    pub fn record_inline(&mut self, doc_count: u32) {
266        self.inline_count += 1;
267        self.inline_docs += doc_count as u64;
268    }
269
270    pub fn total_terms(&self) -> u32 {
271        self.bitpacked_count + self.elias_fano_count + self.roaring_count + self.inline_count
272    }
273
274    pub fn total_postings(&self) -> u64 {
275        self.bitpacked_docs + self.elias_fano_docs + self.roaring_docs + self.inline_docs
276    }
277}
278
279#[cfg(test)]
280mod tests {
281    use super::*;
282
283    #[test]
284    fn test_format_selection() {
285        // Small list -> Bitpacked
286        assert_eq!(
287            PostingFormat::select(100, 1_000_000),
288            PostingFormat::Bitpacked
289        );
290
291        // Medium list -> Bitpacked
292        assert_eq!(
293            PostingFormat::select(5_000, 1_000_000),
294            PostingFormat::Bitpacked
295        );
296
297        // Long list but not frequent enough -> Elias-Fano
298        // 15K / 1M = 1.5% which is > 1% threshold, so it's Roaring
299        // Use a larger corpus to get Elias-Fano
300        assert_eq!(
301            PostingFormat::select(15_000, 10_000_000),
302            PostingFormat::EliasFano
303        );
304
305        // Very frequent term (>1% of corpus AND >10K docs) -> Roaring
306        assert_eq!(
307            PostingFormat::select(50_000, 1_000_000),
308            PostingFormat::Roaring
309        );
310    }
311
312    #[test]
313    fn test_compressed_posting_list_bitpacked() {
314        let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
315        let term_freqs: Vec<u32> = vec![1; 100];
316
317        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
318
319        assert_eq!(list.format(), PostingFormat::Bitpacked);
320        assert_eq!(list.doc_count(), 100);
321
322        let mut iter = list.iterator();
323        for (i, &expected) in doc_ids.iter().enumerate() {
324            assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
325            iter.advance();
326        }
327    }
328
329    #[test]
330    fn test_compressed_posting_list_elias_fano() {
331        let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
332        let term_freqs: Vec<u32> = vec![1; 15_000];
333
334        // Use large corpus so 15K docs is < 1% (Elias-Fano, not Roaring)
335        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
336
337        assert_eq!(list.format(), PostingFormat::EliasFano);
338        assert_eq!(list.doc_count(), 15_000);
339    }
340
341    #[test]
342    fn test_compressed_posting_list_serialization() {
343        let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
344        let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
345
346        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
347
348        let mut buffer = Vec::new();
349        list.serialize(&mut buffer).unwrap();
350
351        let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
352
353        assert_eq!(restored.format(), list.format());
354        assert_eq!(restored.doc_count(), list.doc_count());
355
356        // Verify iteration
357        let mut iter1 = list.iterator();
358        let mut iter2 = restored.iterator();
359
360        while iter1.doc() != u32::MAX {
361            assert_eq!(iter1.doc(), iter2.doc());
362            assert_eq!(iter1.term_freq(), iter2.term_freq());
363            iter1.advance();
364            iter2.advance();
365        }
366    }
367
368    #[test]
369    fn test_iterator_seek() {
370        let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
371        let term_freqs: Vec<u32> = vec![1; 8];
372
373        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
374        let mut iter = list.iterator();
375
376        assert_eq!(iter.seek(25), 30);
377        assert_eq!(iter.seek(100), 100);
378        assert_eq!(iter.seek(500), 1000);
379        assert_eq!(iter.seek(3000), u32::MAX);
380    }
381}