Skip to main content

hermes_core/structures/
posting_format.rs

1//! Unified posting list format with automatic compression selection
2//!
3//! Automatically selects the best compression method based on posting list characteristics:
4//! - **Inline**: 1-3 postings stored directly in TermInfo (no separate I/O)
5//! - **SIMD-BP128**: NEON/SSE accelerated for all lists up to 20K (fastest decoding)
6//! - **Partitioned EF**: Best compression for large lists (>20K)
7//! - **Roaring**: Bitmap-based for very frequent terms (>1% of corpus), fastest iteration
8//!
9//! Note: Bitpacked and plain Elias-Fano are kept for backwards compatibility but
10//! are no longer selected by the automatic format selection algorithm.
11
12use byteorder::{ReadBytesExt, WriteBytesExt};
13use std::io::{self, Read, Write};
14
15use super::bitpacking::BitpackedPostingList;
16use super::elias_fano::EliasFanoPostingList;
17use super::partitioned_ef::PartitionedEFPostingList;
18use super::roaring::RoaringPostingList;
19use super::simd_bp128::SimdBp128PostingList;
20
21/// Thresholds for format selection (based on benchmarks)
22pub const INLINE_THRESHOLD: usize = 3;
23pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; // 1% of corpus
24/// Threshold for using Partitioned EF (best compression)
25pub const PARTITIONED_EF_THRESHOLD: usize = 20_000;
26
27/// Format tag bytes for serialization
28const FORMAT_BITPACKED: u8 = 0;
29const FORMAT_ELIAS_FANO: u8 = 1;
30const FORMAT_ROARING: u8 = 2;
31const FORMAT_SIMD_BP128: u8 = 3;
32const FORMAT_PARTITIONED_EF: u8 = 4;
33
34/// Posting list format selector
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub enum PostingFormat {
37    /// Standard bitpacked format (default for small lists)
38    Bitpacked,
39    /// SIMD-BP128 with vertical layout (medium lists, fast decoding)
40    SimdBp128,
41    /// Elias-Fano for long posting lists
42    EliasFano,
43    /// Partitioned Elias-Fano (very long lists, better compression)
44    PartitionedEF,
45    /// Roaring bitmap for very frequent terms
46    Roaring,
47}
48
49impl PostingFormat {
50    /// Select optimal format based on posting list characteristics
51    ///
52    /// Simplified format selection (based on benchmarks):
53    /// - Very frequent (>1% of corpus, >20K docs): Roaring (fastest iteration, set ops)
54    /// - Large (≥20K docs): Partitioned EF (best compression 15.9%)
55    /// - All other lists: SIMD-BP128 (fastest decoding 198K ints/μs, optimal compression 16.3%)
56    ///
57    /// Note: SIMD-BP128 replaces both Bitpacked and plain Elias-Fano as it has:
58    /// - Same compression as Bitpacked (16.3%)
59    /// - Faster decoding than Bitpacked (198K vs 192K ints/μs)
60    /// - Much faster than Elias-Fano (198K vs 9.6K ints/μs)
61    pub fn select(doc_count: usize, total_docs: usize) -> Self {
62        let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
63
64        if frequency_ratio >= ROARING_THRESHOLD_RATIO && doc_count >= PARTITIONED_EF_THRESHOLD {
65            // Very frequent term - use Roaring for fast intersections
66            PostingFormat::Roaring
67        } else if doc_count >= PARTITIONED_EF_THRESHOLD {
68            // Large posting list - use Partitioned EF for best compression
69            PostingFormat::PartitionedEF
70        } else {
71            // All other lists - use SIMD-BP128 (fastest decoding, optimal compression)
72            PostingFormat::SimdBp128
73        }
74    }
75}
76
77/// Unified posting list that can use any compression format
78#[derive(Debug, Clone)]
79pub enum CompressedPostingList {
80    Bitpacked(BitpackedPostingList),
81    SimdBp128(SimdBp128PostingList),
82    EliasFano(EliasFanoPostingList),
83    PartitionedEF(PartitionedEFPostingList),
84    Roaring(RoaringPostingList),
85}
86
87impl CompressedPostingList {
88    /// Create from raw postings with automatic format selection
89    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
90        let format = PostingFormat::select(doc_ids.len(), total_docs);
91
92        match format {
93            PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
94                BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
95            ),
96            PostingFormat::SimdBp128 => CompressedPostingList::SimdBp128(
97                SimdBp128PostingList::from_postings(doc_ids, term_freqs, idf),
98            ),
99            PostingFormat::EliasFano => CompressedPostingList::EliasFano(
100                EliasFanoPostingList::from_postings(doc_ids, term_freqs),
101            ),
102            PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
103                PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
104            ),
105            PostingFormat::Roaring => CompressedPostingList::Roaring(
106                RoaringPostingList::from_postings(doc_ids, term_freqs),
107            ),
108        }
109    }
110
111    /// Create with explicit format selection
112    pub fn from_postings_with_format(
113        doc_ids: &[u32],
114        term_freqs: &[u32],
115        format: PostingFormat,
116        idf: f32,
117    ) -> Self {
118        match format {
119            PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
120                BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
121            ),
122            PostingFormat::SimdBp128 => CompressedPostingList::SimdBp128(
123                SimdBp128PostingList::from_postings(doc_ids, term_freqs, idf),
124            ),
125            PostingFormat::EliasFano => CompressedPostingList::EliasFano(
126                EliasFanoPostingList::from_postings(doc_ids, term_freqs),
127            ),
128            PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
129                PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
130            ),
131            PostingFormat::Roaring => CompressedPostingList::Roaring(
132                RoaringPostingList::from_postings(doc_ids, term_freqs),
133            ),
134        }
135    }
136
137    /// Get document count
138    pub fn doc_count(&self) -> u32 {
139        match self {
140            CompressedPostingList::Bitpacked(p) => p.doc_count,
141            CompressedPostingList::SimdBp128(p) => p.doc_count,
142            CompressedPostingList::EliasFano(p) => p.len(),
143            CompressedPostingList::PartitionedEF(p) => p.len(),
144            CompressedPostingList::Roaring(p) => p.len(),
145        }
146    }
147
148    /// Get maximum term frequency
149    pub fn max_tf(&self) -> u32 {
150        match self {
151            CompressedPostingList::Bitpacked(p) => p.max_score as u32, // Approximation
152            CompressedPostingList::SimdBp128(p) => {
153                p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
154            }
155            CompressedPostingList::EliasFano(p) => p.max_tf,
156            CompressedPostingList::PartitionedEF(p) => p.max_tf,
157            CompressedPostingList::Roaring(p) => p.max_tf,
158        }
159    }
160
161    /// Get format type
162    pub fn format(&self) -> PostingFormat {
163        match self {
164            CompressedPostingList::Bitpacked(_) => PostingFormat::Bitpacked,
165            CompressedPostingList::SimdBp128(_) => PostingFormat::SimdBp128,
166            CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
167            CompressedPostingList::PartitionedEF(_) => PostingFormat::PartitionedEF,
168            CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
169        }
170    }
171
172    /// Serialize
173    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
174        match self {
175            CompressedPostingList::Bitpacked(p) => {
176                writer.write_u8(FORMAT_BITPACKED)?;
177                p.serialize(writer)
178            }
179            CompressedPostingList::SimdBp128(p) => {
180                writer.write_u8(FORMAT_SIMD_BP128)?;
181                p.serialize(writer)
182            }
183            CompressedPostingList::EliasFano(p) => {
184                writer.write_u8(FORMAT_ELIAS_FANO)?;
185                p.serialize(writer)
186            }
187            CompressedPostingList::PartitionedEF(p) => {
188                writer.write_u8(FORMAT_PARTITIONED_EF)?;
189                p.serialize(writer)
190            }
191            CompressedPostingList::Roaring(p) => {
192                writer.write_u8(FORMAT_ROARING)?;
193                p.serialize(writer)
194            }
195        }
196    }
197
198    /// Deserialize
199    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
200        let format = reader.read_u8()?;
201        match format {
202            FORMAT_BITPACKED => Ok(CompressedPostingList::Bitpacked(
203                BitpackedPostingList::deserialize(reader)?,
204            )),
205            FORMAT_SIMD_BP128 => Ok(CompressedPostingList::SimdBp128(
206                SimdBp128PostingList::deserialize(reader)?,
207            )),
208            FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
209                EliasFanoPostingList::deserialize(reader)?,
210            )),
211            FORMAT_PARTITIONED_EF => Ok(CompressedPostingList::PartitionedEF(
212                PartitionedEFPostingList::deserialize(reader)?,
213            )),
214            FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
215                RoaringPostingList::deserialize(reader)?,
216            )),
217            _ => Err(io::Error::new(
218                io::ErrorKind::InvalidData,
219                format!("Unknown posting list format: {}", format),
220            )),
221        }
222    }
223
224    /// Create an iterator
225    pub fn iterator(&self) -> CompressedPostingIterator<'_> {
226        match self {
227            CompressedPostingList::Bitpacked(p) => {
228                CompressedPostingIterator::Bitpacked(p.iterator())
229            }
230            CompressedPostingList::SimdBp128(p) => {
231                CompressedPostingIterator::SimdBp128(p.iterator())
232            }
233            CompressedPostingList::EliasFano(p) => {
234                CompressedPostingIterator::EliasFano(p.iterator())
235            }
236            CompressedPostingList::PartitionedEF(p) => {
237                CompressedPostingIterator::PartitionedEF(p.iterator())
238            }
239            CompressedPostingList::Roaring(p) => {
240                let mut iter = p.iterator();
241                iter.init();
242                CompressedPostingIterator::Roaring(iter)
243            }
244        }
245    }
246}
247
248/// Unified iterator over any posting list format
249pub enum CompressedPostingIterator<'a> {
250    Bitpacked(super::bitpacking::BitpackedPostingIterator<'a>),
251    SimdBp128(super::simd_bp128::SimdBp128Iterator<'a>),
252    EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
253    PartitionedEF(super::partitioned_ef::PartitionedEFPostingIterator<'a>),
254    Roaring(super::roaring::RoaringPostingIterator<'a>),
255}
256
257impl<'a> CompressedPostingIterator<'a> {
258    /// Current document ID
259    pub fn doc(&self) -> u32 {
260        match self {
261            CompressedPostingIterator::Bitpacked(i) => i.doc(),
262            CompressedPostingIterator::SimdBp128(i) => i.doc(),
263            CompressedPostingIterator::EliasFano(i) => i.doc(),
264            CompressedPostingIterator::PartitionedEF(i) => i.doc(),
265            CompressedPostingIterator::Roaring(i) => i.doc(),
266        }
267    }
268
269    /// Current term frequency
270    pub fn term_freq(&self) -> u32 {
271        match self {
272            CompressedPostingIterator::Bitpacked(i) => i.term_freq(),
273            CompressedPostingIterator::SimdBp128(i) => i.term_freq(),
274            CompressedPostingIterator::EliasFano(i) => i.term_freq(),
275            CompressedPostingIterator::PartitionedEF(i) => i.term_freq(),
276            CompressedPostingIterator::Roaring(i) => i.term_freq(),
277        }
278    }
279
280    /// Advance to next document
281    pub fn advance(&mut self) -> u32 {
282        match self {
283            CompressedPostingIterator::Bitpacked(i) => i.advance(),
284            CompressedPostingIterator::SimdBp128(i) => i.advance(),
285            CompressedPostingIterator::EliasFano(i) => i.advance(),
286            CompressedPostingIterator::PartitionedEF(i) => i.advance(),
287            CompressedPostingIterator::Roaring(i) => i.advance(),
288        }
289    }
290
291    /// Seek to first doc >= target
292    pub fn seek(&mut self, target: u32) -> u32 {
293        match self {
294            CompressedPostingIterator::Bitpacked(i) => i.seek(target),
295            CompressedPostingIterator::SimdBp128(i) => i.seek(target),
296            CompressedPostingIterator::EliasFano(i) => i.seek(target),
297            CompressedPostingIterator::PartitionedEF(i) => i.seek(target),
298            CompressedPostingIterator::Roaring(i) => i.seek(target),
299        }
300    }
301
302    /// Check if exhausted
303    pub fn is_exhausted(&self) -> bool {
304        self.doc() == u32::MAX
305    }
306}
307
308/// Statistics about compression format usage
309#[derive(Debug, Default, Clone)]
310pub struct CompressionStats {
311    pub bitpacked_count: u32,
312    pub bitpacked_docs: u64,
313    pub simd_bp128_count: u32,
314    pub simd_bp128_docs: u64,
315    pub elias_fano_count: u32,
316    pub elias_fano_docs: u64,
317    pub partitioned_ef_count: u32,
318    pub partitioned_ef_docs: u64,
319    pub roaring_count: u32,
320    pub roaring_docs: u64,
321    pub inline_count: u32,
322    pub inline_docs: u64,
323}
324
325impl CompressionStats {
326    pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
327        match format {
328            PostingFormat::Bitpacked => {
329                self.bitpacked_count += 1;
330                self.bitpacked_docs += doc_count as u64;
331            }
332            PostingFormat::SimdBp128 => {
333                self.simd_bp128_count += 1;
334                self.simd_bp128_docs += doc_count as u64;
335            }
336            PostingFormat::EliasFano => {
337                self.elias_fano_count += 1;
338                self.elias_fano_docs += doc_count as u64;
339            }
340            PostingFormat::PartitionedEF => {
341                self.partitioned_ef_count += 1;
342                self.partitioned_ef_docs += doc_count as u64;
343            }
344            PostingFormat::Roaring => {
345                self.roaring_count += 1;
346                self.roaring_docs += doc_count as u64;
347            }
348        }
349    }
350
351    pub fn record_inline(&mut self, doc_count: u32) {
352        self.inline_count += 1;
353        self.inline_docs += doc_count as u64;
354    }
355
356    pub fn total_terms(&self) -> u32 {
357        self.bitpacked_count
358            + self.simd_bp128_count
359            + self.elias_fano_count
360            + self.partitioned_ef_count
361            + self.roaring_count
362            + self.inline_count
363    }
364
365    pub fn total_postings(&self) -> u64 {
366        self.bitpacked_docs
367            + self.simd_bp128_docs
368            + self.elias_fano_docs
369            + self.partitioned_ef_docs
370            + self.roaring_docs
371            + self.inline_docs
372    }
373}
374
375#[cfg(test)]
376mod tests {
377    use super::*;
378
379    #[test]
380    fn test_format_selection() {
381        // Small list -> SIMD-BP128 (replaces Bitpacked)
382        assert_eq!(
383            PostingFormat::select(100, 1_000_000),
384            PostingFormat::SimdBp128
385        );
386
387        // Medium list -> SIMD-BP128
388        assert_eq!(
389            PostingFormat::select(500, 1_000_000),
390            PostingFormat::SimdBp128
391        );
392
393        assert_eq!(
394            PostingFormat::select(5_000, 1_000_000),
395            PostingFormat::SimdBp128
396        );
397
398        assert_eq!(
399            PostingFormat::select(15_000, 10_000_000),
400            PostingFormat::SimdBp128
401        );
402
403        // Large list (>=20K) -> Partitioned EF
404        assert_eq!(
405            PostingFormat::select(25_000, 10_000_000),
406            PostingFormat::PartitionedEF
407        );
408
409        // Very frequent term (>1% of corpus AND >=20K docs) -> Roaring
410        assert_eq!(
411            PostingFormat::select(50_000, 1_000_000),
412            PostingFormat::Roaring
413        );
414    }
415
416    #[test]
417    fn test_compressed_posting_list_small() {
418        let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
419        let term_freqs: Vec<u32> = vec![1; 100];
420
421        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
422
423        // Small lists now use SIMD-BP128 (replaces Bitpacked)
424        assert_eq!(list.format(), PostingFormat::SimdBp128);
425        assert_eq!(list.doc_count(), 100);
426
427        let mut iter = list.iterator();
428        for (i, &expected) in doc_ids.iter().enumerate() {
429            assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
430            iter.advance();
431        }
432    }
433
434    #[test]
435    fn test_compressed_posting_list_simd_bp128() {
436        let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
437        let term_freqs: Vec<u32> = vec![1; 15_000];
438
439        // 15K docs with large corpus -> SIMD-BP128 (256-20K range)
440        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
441
442        assert_eq!(list.format(), PostingFormat::SimdBp128);
443        assert_eq!(list.doc_count(), 15_000);
444    }
445
446    #[test]
447    fn test_compressed_posting_list_serialization() {
448        let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
449        let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
450
451        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
452
453        let mut buffer = Vec::new();
454        list.serialize(&mut buffer).unwrap();
455
456        let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
457
458        assert_eq!(restored.format(), list.format());
459        assert_eq!(restored.doc_count(), list.doc_count());
460
461        // Verify iteration
462        let mut iter1 = list.iterator();
463        let mut iter2 = restored.iterator();
464
465        while iter1.doc() != u32::MAX {
466            assert_eq!(iter1.doc(), iter2.doc());
467            assert_eq!(iter1.term_freq(), iter2.term_freq());
468            iter1.advance();
469            iter2.advance();
470        }
471    }
472
473    #[test]
474    fn test_iterator_seek() {
475        let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
476        let term_freqs: Vec<u32> = vec![1; 8];
477
478        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
479        let mut iter = list.iterator();
480
481        assert_eq!(iter.seek(25), 30);
482        assert_eq!(iter.seek(100), 100);
483        assert_eq!(iter.seek(500), 1000);
484        assert_eq!(iter.seek(3000), u32::MAX);
485    }
486}