hermes_core/structures/
posting_format.rs

1//! Unified posting list format with automatic compression selection
2//!
3//! Automatically selects the best compression method based on posting list characteristics:
4//! - **Inline**: 1-3 postings stored directly in TermInfo (no separate I/O)
5//! - **HorizontalBP128**: Horizontal layout with SIMD (fastest encoding + decoding)
6//! - **Partitioned EF**: Best compression for large lists (>20K)
7//! - **Roaring**: Bitmap-based for very frequent terms (>1% of corpus), fastest iteration
8//!
9//! Note: VerticalBP128 and plain Elias-Fano are kept for backwards compatibility but
10//! are no longer selected by the automatic format selection algorithm.
11
12use byteorder::{ReadBytesExt, WriteBytesExt};
13use std::io::{self, Read, Write};
14
15use super::elias_fano::EliasFanoPostingList;
16use super::horizontal_bp128::HorizontalBP128PostingList;
17use super::opt_p4d::OptP4DPostingList;
18use super::partitioned_ef::PartitionedEFPostingList;
19use super::roaring::RoaringPostingList;
20use super::vertical_bp128::VerticalBP128PostingList;
21
22/// Thresholds for format selection (based on benchmarks)
23pub const INLINE_THRESHOLD: usize = 3;
24pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; // 1% of corpus
25/// Threshold for using Partitioned EF (best compression)
26pub const PARTITIONED_EF_THRESHOLD: usize = 20_000;
27
28/// Format tag bytes for serialization
29const FORMAT_HORIZONTAL_BP128: u8 = 0;
30const FORMAT_ELIAS_FANO: u8 = 1;
31const FORMAT_ROARING: u8 = 2;
32const FORMAT_VERTICAL_BP128: u8 = 3;
33const FORMAT_PARTITIONED_EF: u8 = 4;
34const FORMAT_OPT_P4D: u8 = 5;
35
36/// Posting list format selector
37#[derive(Debug, Clone, Copy, PartialEq, Eq)]
38pub enum PostingFormat {
39    /// Horizontal BP128 layout (fastest encoding + decoding with SIMD)
40    HorizontalBP128,
41    /// Vertical BP128 with bit-interleaved layout (optimal for SIMD scatter)
42    VerticalBP128,
43    /// Elias-Fano for long posting lists
44    EliasFano,
45    /// Partitioned Elias-Fano (very long lists, better compression)
46    PartitionedEF,
47    /// Roaring bitmap for very frequent terms
48    Roaring,
49    /// OptP4D (Optimized Patched Frame-of-Reference Delta) - optimal bit-width selection with exceptions
50    OptP4D,
51}
52
53impl PostingFormat {
54    /// Select optimal format based on posting list characteristics
55    ///
56    /// Simplified format selection (based on benchmarks):
57    /// - Very frequent (>1% of corpus, >20K docs): Roaring (fastest iteration, set ops)
58    /// - Large (≥20K docs): Partitioned EF (best compression 15.9%)
59    /// - All other lists: HorizontalBP128 (fastest encoding + decoding with SIMD, optimal compression)
60    ///
61    /// Note: HorizontalBP128 now has SIMD optimizations (NEON/SSE) and outperforms SIMD-BP128:
62    /// - Faster encoding (horizontal layout vs vertical bit-interleaved)
63    /// - Faster decoding with 64-bit unaligned reads (236K vs 199K ints/μs)
64    /// - Same compression ratio
65    pub fn select(doc_count: usize, total_docs: usize) -> Self {
66        let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
67
68        if frequency_ratio >= ROARING_THRESHOLD_RATIO && doc_count >= PARTITIONED_EF_THRESHOLD {
69            // Very frequent term - use Roaring for fast intersections
70            PostingFormat::Roaring
71        } else if doc_count >= PARTITIONED_EF_THRESHOLD {
72            // Large posting list - use Partitioned EF for best compression
73            PostingFormat::PartitionedEF
74        } else {
75            // All other lists - use HorizontalBP128 (fastest encoding + decoding with SIMD)
76            PostingFormat::HorizontalBP128
77        }
78    }
79}
80
81/// Unified posting list that can use any compression format
82#[derive(Debug, Clone)]
83pub enum CompressedPostingList {
84    HorizontalBP128(HorizontalBP128PostingList),
85    VerticalBP128(VerticalBP128PostingList),
86    EliasFano(EliasFanoPostingList),
87    PartitionedEF(PartitionedEFPostingList),
88    Roaring(RoaringPostingList),
89    OptP4D(OptP4DPostingList),
90}
91
92impl CompressedPostingList {
93    /// Create from raw postings with automatic format selection
94    pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
95        let format = PostingFormat::select(doc_ids.len(), total_docs);
96
97        match format {
98            PostingFormat::HorizontalBP128 => CompressedPostingList::HorizontalBP128(
99                HorizontalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
100            ),
101            PostingFormat::VerticalBP128 => CompressedPostingList::VerticalBP128(
102                VerticalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
103            ),
104            PostingFormat::EliasFano => CompressedPostingList::EliasFano(
105                EliasFanoPostingList::from_postings(doc_ids, term_freqs),
106            ),
107            PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
108                PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
109            ),
110            PostingFormat::Roaring => CompressedPostingList::Roaring(
111                RoaringPostingList::from_postings(doc_ids, term_freqs),
112            ),
113            PostingFormat::OptP4D => CompressedPostingList::OptP4D(
114                OptP4DPostingList::from_postings(doc_ids, term_freqs, idf),
115            ),
116        }
117    }
118
119    /// Create with explicit format selection
120    pub fn from_postings_with_format(
121        doc_ids: &[u32],
122        term_freqs: &[u32],
123        format: PostingFormat,
124        idf: f32,
125    ) -> Self {
126        match format {
127            PostingFormat::HorizontalBP128 => CompressedPostingList::HorizontalBP128(
128                HorizontalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
129            ),
130            PostingFormat::VerticalBP128 => CompressedPostingList::VerticalBP128(
131                VerticalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
132            ),
133            PostingFormat::EliasFano => CompressedPostingList::EliasFano(
134                EliasFanoPostingList::from_postings(doc_ids, term_freqs),
135            ),
136            PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
137                PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
138            ),
139            PostingFormat::Roaring => CompressedPostingList::Roaring(
140                RoaringPostingList::from_postings(doc_ids, term_freqs),
141            ),
142            PostingFormat::OptP4D => CompressedPostingList::OptP4D(
143                OptP4DPostingList::from_postings(doc_ids, term_freqs, idf),
144            ),
145        }
146    }
147
148    /// Get document count
149    pub fn doc_count(&self) -> u32 {
150        match self {
151            CompressedPostingList::HorizontalBP128(p) => p.doc_count,
152            CompressedPostingList::VerticalBP128(p) => p.doc_count,
153            CompressedPostingList::EliasFano(p) => p.len(),
154            CompressedPostingList::PartitionedEF(p) => p.len(),
155            CompressedPostingList::Roaring(p) => p.len(),
156            CompressedPostingList::OptP4D(p) => p.len(),
157        }
158    }
159
160    /// Get maximum term frequency
161    pub fn max_tf(&self) -> u32 {
162        match self {
163            CompressedPostingList::HorizontalBP128(p) => p.max_score as u32, // Approximation
164            CompressedPostingList::VerticalBP128(p) => {
165                p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
166            }
167            CompressedPostingList::EliasFano(p) => p.max_tf,
168            CompressedPostingList::PartitionedEF(p) => p.max_tf,
169            CompressedPostingList::Roaring(p) => p.max_tf,
170            CompressedPostingList::OptP4D(p) => {
171                p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
172            }
173        }
174    }
175
176    /// Get format type
177    pub fn format(&self) -> PostingFormat {
178        match self {
179            CompressedPostingList::HorizontalBP128(_) => PostingFormat::HorizontalBP128,
180            CompressedPostingList::VerticalBP128(_) => PostingFormat::VerticalBP128,
181            CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
182            CompressedPostingList::PartitionedEF(_) => PostingFormat::PartitionedEF,
183            CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
184            CompressedPostingList::OptP4D(_) => PostingFormat::OptP4D,
185        }
186    }
187
188    /// Serialize
189    pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
190        match self {
191            CompressedPostingList::HorizontalBP128(p) => {
192                writer.write_u8(FORMAT_HORIZONTAL_BP128)?;
193                p.serialize(writer)
194            }
195            CompressedPostingList::VerticalBP128(p) => {
196                writer.write_u8(FORMAT_VERTICAL_BP128)?;
197                p.serialize(writer)
198            }
199            CompressedPostingList::EliasFano(p) => {
200                writer.write_u8(FORMAT_ELIAS_FANO)?;
201                p.serialize(writer)
202            }
203            CompressedPostingList::PartitionedEF(p) => {
204                writer.write_u8(FORMAT_PARTITIONED_EF)?;
205                p.serialize(writer)
206            }
207            CompressedPostingList::Roaring(p) => {
208                writer.write_u8(FORMAT_ROARING)?;
209                p.serialize(writer)
210            }
211            CompressedPostingList::OptP4D(p) => {
212                writer.write_u8(FORMAT_OPT_P4D)?;
213                p.serialize(writer)
214            }
215        }
216    }
217
218    /// Deserialize
219    pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
220        let format = reader.read_u8()?;
221        match format {
222            FORMAT_HORIZONTAL_BP128 => Ok(CompressedPostingList::HorizontalBP128(
223                HorizontalBP128PostingList::deserialize(reader)?,
224            )),
225            FORMAT_VERTICAL_BP128 => Ok(CompressedPostingList::VerticalBP128(
226                VerticalBP128PostingList::deserialize(reader)?,
227            )),
228            FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
229                EliasFanoPostingList::deserialize(reader)?,
230            )),
231            FORMAT_PARTITIONED_EF => Ok(CompressedPostingList::PartitionedEF(
232                PartitionedEFPostingList::deserialize(reader)?,
233            )),
234            FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
235                RoaringPostingList::deserialize(reader)?,
236            )),
237            FORMAT_OPT_P4D => Ok(CompressedPostingList::OptP4D(
238                OptP4DPostingList::deserialize(reader)?,
239            )),
240            _ => Err(io::Error::new(
241                io::ErrorKind::InvalidData,
242                format!("Unknown posting list format: {}", format),
243            )),
244        }
245    }
246
247    /// Create an iterator
248    pub fn iterator(&self) -> CompressedPostingIterator<'_> {
249        match self {
250            CompressedPostingList::HorizontalBP128(p) => {
251                CompressedPostingIterator::HorizontalBP128(p.iterator())
252            }
253            CompressedPostingList::VerticalBP128(p) => {
254                CompressedPostingIterator::VerticalBP128(p.iterator())
255            }
256            CompressedPostingList::EliasFano(p) => {
257                CompressedPostingIterator::EliasFano(p.iterator())
258            }
259            CompressedPostingList::PartitionedEF(p) => {
260                CompressedPostingIterator::PartitionedEF(p.iterator())
261            }
262            CompressedPostingList::Roaring(p) => {
263                let mut iter = p.iterator();
264                iter.init();
265                CompressedPostingIterator::Roaring(iter)
266            }
267            CompressedPostingList::OptP4D(p) => CompressedPostingIterator::OptP4D(p.iterator()),
268        }
269    }
270}
271
272/// Unified iterator over any posting list format
273pub enum CompressedPostingIterator<'a> {
274    HorizontalBP128(super::horizontal_bp128::HorizontalBP128Iterator<'a>),
275    VerticalBP128(super::vertical_bp128::VerticalBP128Iterator<'a>),
276    EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
277    PartitionedEF(super::partitioned_ef::PartitionedEFPostingIterator<'a>),
278    Roaring(super::roaring::RoaringPostingIterator<'a>),
279    OptP4D(super::opt_p4d::OptP4DIterator<'a>),
280}
281
282impl<'a> CompressedPostingIterator<'a> {
283    /// Current document ID
284    pub fn doc(&self) -> u32 {
285        match self {
286            CompressedPostingIterator::HorizontalBP128(i) => i.doc(),
287            CompressedPostingIterator::VerticalBP128(i) => i.doc(),
288            CompressedPostingIterator::EliasFano(i) => i.doc(),
289            CompressedPostingIterator::PartitionedEF(i) => i.doc(),
290            CompressedPostingIterator::Roaring(i) => i.doc(),
291            CompressedPostingIterator::OptP4D(i) => i.doc(),
292        }
293    }
294
295    /// Current term frequency
296    pub fn term_freq(&self) -> u32 {
297        match self {
298            CompressedPostingIterator::HorizontalBP128(i) => i.term_freq(),
299            CompressedPostingIterator::VerticalBP128(i) => i.term_freq(),
300            CompressedPostingIterator::EliasFano(i) => i.term_freq(),
301            CompressedPostingIterator::PartitionedEF(i) => i.term_freq(),
302            CompressedPostingIterator::Roaring(i) => i.term_freq(),
303            CompressedPostingIterator::OptP4D(i) => i.term_freq(),
304        }
305    }
306
307    /// Advance to next document
308    pub fn advance(&mut self) -> u32 {
309        match self {
310            CompressedPostingIterator::HorizontalBP128(i) => i.advance(),
311            CompressedPostingIterator::VerticalBP128(i) => i.advance(),
312            CompressedPostingIterator::EliasFano(i) => i.advance(),
313            CompressedPostingIterator::PartitionedEF(i) => i.advance(),
314            CompressedPostingIterator::Roaring(i) => i.advance(),
315            CompressedPostingIterator::OptP4D(i) => i.advance(),
316        }
317    }
318
319    /// Seek to first doc >= target
320    pub fn seek(&mut self, target: u32) -> u32 {
321        match self {
322            CompressedPostingIterator::HorizontalBP128(i) => i.seek(target),
323            CompressedPostingIterator::VerticalBP128(i) => i.seek(target),
324            CompressedPostingIterator::EliasFano(i) => i.seek(target),
325            CompressedPostingIterator::PartitionedEF(i) => i.seek(target),
326            CompressedPostingIterator::Roaring(i) => i.seek(target),
327            CompressedPostingIterator::OptP4D(i) => i.seek(target),
328        }
329    }
330
331    /// Check if exhausted
332    pub fn is_exhausted(&self) -> bool {
333        self.doc() == u32::MAX
334    }
335}
336
337/// Statistics about compression format usage
338#[derive(Debug, Default, Clone)]
339pub struct CompressionStats {
340    pub bitpacked_count: u32,
341    pub bitpacked_docs: u64,
342    pub simd_bp128_count: u32,
343    pub simd_bp128_docs: u64,
344    pub elias_fano_count: u32,
345    pub elias_fano_docs: u64,
346    pub partitioned_ef_count: u32,
347    pub partitioned_ef_docs: u64,
348    pub roaring_count: u32,
349    pub roaring_docs: u64,
350    pub inline_count: u32,
351    pub inline_docs: u64,
352}
353
354impl CompressionStats {
355    pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
356        match format {
357            PostingFormat::HorizontalBP128 => {
358                self.bitpacked_count += 1;
359                self.bitpacked_docs += doc_count as u64;
360            }
361            PostingFormat::VerticalBP128 => {
362                self.simd_bp128_count += 1;
363                self.simd_bp128_docs += doc_count as u64;
364            }
365            PostingFormat::EliasFano => {
366                self.elias_fano_count += 1;
367                self.elias_fano_docs += doc_count as u64;
368            }
369            PostingFormat::PartitionedEF => {
370                self.partitioned_ef_count += 1;
371                self.partitioned_ef_docs += doc_count as u64;
372            }
373            PostingFormat::Roaring => {
374                self.roaring_count += 1;
375                self.roaring_docs += doc_count as u64;
376            }
377            PostingFormat::OptP4D => {
378                // Track OptP4D under bitpacked for now (similar compression family)
379                self.bitpacked_count += 1;
380                self.bitpacked_docs += doc_count as u64;
381            }
382        }
383    }
384
385    pub fn record_inline(&mut self, doc_count: u32) {
386        self.inline_count += 1;
387        self.inline_docs += doc_count as u64;
388    }
389
390    pub fn total_terms(&self) -> u32 {
391        self.bitpacked_count
392            + self.simd_bp128_count
393            + self.elias_fano_count
394            + self.partitioned_ef_count
395            + self.roaring_count
396            + self.inline_count
397    }
398
399    pub fn total_postings(&self) -> u64 {
400        self.bitpacked_docs
401            + self.simd_bp128_docs
402            + self.elias_fano_docs
403            + self.partitioned_ef_docs
404            + self.roaring_docs
405            + self.inline_docs
406    }
407}
408
409#[cfg(test)]
410mod tests {
411    use super::*;
412
413    #[test]
414    fn test_format_selection() {
415        // Small list -> HorizontalBP128 (fastest encoding + decoding with SIMD)
416        assert_eq!(
417            PostingFormat::select(100, 1_000_000),
418            PostingFormat::HorizontalBP128
419        );
420
421        // Medium list -> HorizontalBP128
422        assert_eq!(
423            PostingFormat::select(500, 1_000_000),
424            PostingFormat::HorizontalBP128
425        );
426
427        assert_eq!(
428            PostingFormat::select(5_000, 1_000_000),
429            PostingFormat::HorizontalBP128
430        );
431
432        assert_eq!(
433            PostingFormat::select(15_000, 10_000_000),
434            PostingFormat::HorizontalBP128
435        );
436
437        // Large list (>=20K) -> Partitioned EF
438        assert_eq!(
439            PostingFormat::select(25_000, 10_000_000),
440            PostingFormat::PartitionedEF
441        );
442
443        // Very frequent term (>1% of corpus AND >=20K docs) -> Roaring
444        assert_eq!(
445            PostingFormat::select(50_000, 1_000_000),
446            PostingFormat::Roaring
447        );
448    }
449
450    #[test]
451    fn test_compressed_posting_list_small() {
452        let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
453        let term_freqs: Vec<u32> = vec![1; 100];
454
455        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
456
457        // Small lists use HorizontalBP128 (fastest encoding + decoding with SIMD)
458        assert_eq!(list.format(), PostingFormat::HorizontalBP128);
459        assert_eq!(list.doc_count(), 100);
460
461        let mut iter = list.iterator();
462        for (i, &expected) in doc_ids.iter().enumerate() {
463            assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
464            iter.advance();
465        }
466    }
467
468    #[test]
469    fn test_compressed_posting_list_bitpacked() {
470        let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
471        let term_freqs: Vec<u32> = vec![1; 15_000];
472
473        // 15K docs with large corpus -> HorizontalBP128 (under 20K threshold)
474        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
475
476        assert_eq!(list.format(), PostingFormat::HorizontalBP128);
477        assert_eq!(list.doc_count(), 15_000);
478    }
479
480    #[test]
481    fn test_compressed_posting_list_serialization() {
482        let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
483        let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
484
485        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
486
487        let mut buffer = Vec::new();
488        list.serialize(&mut buffer).unwrap();
489
490        let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
491
492        assert_eq!(restored.format(), list.format());
493        assert_eq!(restored.doc_count(), list.doc_count());
494
495        // Verify iteration
496        let mut iter1 = list.iterator();
497        let mut iter2 = restored.iterator();
498
499        while iter1.doc() != u32::MAX {
500            assert_eq!(iter1.doc(), iter2.doc());
501            assert_eq!(iter1.term_freq(), iter2.term_freq());
502            iter1.advance();
503            iter2.advance();
504        }
505    }
506
507    #[test]
508    fn test_iterator_seek() {
509        let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
510        let term_freqs: Vec<u32> = vec![1; 8];
511
512        let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
513        let mut iter = list.iterator();
514
515        assert_eq!(iter.seek(25), 30);
516        assert_eq!(iter.seek(100), 100);
517        assert_eq!(iter.seek(500), 1000);
518        assert_eq!(iter.seek(3000), u32::MAX);
519    }
520
521    #[test]
522    fn test_opt_p4d_via_unified_interface() {
523        let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
524        let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
525
526        // Create OptP4D explicitly via the unified interface
527        let list = CompressedPostingList::from_postings_with_format(
528            &doc_ids,
529            &term_freqs,
530            PostingFormat::OptP4D,
531            1.0,
532        );
533
534        assert_eq!(list.format(), PostingFormat::OptP4D);
535        assert_eq!(list.doc_count(), 500);
536
537        // Test serialization roundtrip
538        let mut buffer = Vec::new();
539        list.serialize(&mut buffer).unwrap();
540        let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
541
542        assert_eq!(restored.format(), PostingFormat::OptP4D);
543        assert_eq!(restored.doc_count(), 500);
544
545        // Verify iteration
546        let mut iter1 = list.iterator();
547        let mut iter2 = restored.iterator();
548
549        while iter1.doc() != u32::MAX {
550            assert_eq!(iter1.doc(), iter2.doc());
551            assert_eq!(iter1.term_freq(), iter2.term_freq());
552            iter1.advance();
553            iter2.advance();
554        }
555
556        // Test seek
557        let mut iter = list.iterator();
558        assert_eq!(iter.seek(100), 102); // 34 * 3 = 102
559        assert_eq!(iter.seek(500), 501); // 167 * 3 = 501
560    }
561}