1use byteorder::{ReadBytesExt, WriteBytesExt};
13use std::io::{self, Read, Write};
14
15use super::bitpacking::BitpackedPostingList;
16use super::elias_fano::EliasFanoPostingList;
17use super::partitioned_ef::PartitionedEFPostingList;
18use super::roaring::RoaringPostingList;
19use super::simd_bp128::SimdBp128PostingList;
20
21pub const INLINE_THRESHOLD: usize = 3;
23pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; pub const PARTITIONED_EF_THRESHOLD: usize = 20_000;
26
27const FORMAT_BITPACKED: u8 = 0;
29const FORMAT_ELIAS_FANO: u8 = 1;
30const FORMAT_ROARING: u8 = 2;
31const FORMAT_SIMD_BP128: u8 = 3;
32const FORMAT_PARTITIONED_EF: u8 = 4;
33
34#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub enum PostingFormat {
37 Bitpacked,
39 SimdBp128,
41 EliasFano,
43 PartitionedEF,
45 Roaring,
47}
48
49impl PostingFormat {
50 pub fn select(doc_count: usize, total_docs: usize) -> Self {
62 let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
63
64 if frequency_ratio >= ROARING_THRESHOLD_RATIO && doc_count >= PARTITIONED_EF_THRESHOLD {
65 PostingFormat::Roaring
67 } else if doc_count >= PARTITIONED_EF_THRESHOLD {
68 PostingFormat::PartitionedEF
70 } else {
71 PostingFormat::SimdBp128
73 }
74 }
75}
76
77#[derive(Debug, Clone)]
79pub enum CompressedPostingList {
80 Bitpacked(BitpackedPostingList),
81 SimdBp128(SimdBp128PostingList),
82 EliasFano(EliasFanoPostingList),
83 PartitionedEF(PartitionedEFPostingList),
84 Roaring(RoaringPostingList),
85}
86
87impl CompressedPostingList {
88 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
90 let format = PostingFormat::select(doc_ids.len(), total_docs);
91
92 match format {
93 PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
94 BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
95 ),
96 PostingFormat::SimdBp128 => CompressedPostingList::SimdBp128(
97 SimdBp128PostingList::from_postings(doc_ids, term_freqs, idf),
98 ),
99 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
100 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
101 ),
102 PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
103 PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
104 ),
105 PostingFormat::Roaring => CompressedPostingList::Roaring(
106 RoaringPostingList::from_postings(doc_ids, term_freqs),
107 ),
108 }
109 }
110
111 pub fn from_postings_with_format(
113 doc_ids: &[u32],
114 term_freqs: &[u32],
115 format: PostingFormat,
116 idf: f32,
117 ) -> Self {
118 match format {
119 PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
120 BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
121 ),
122 PostingFormat::SimdBp128 => CompressedPostingList::SimdBp128(
123 SimdBp128PostingList::from_postings(doc_ids, term_freqs, idf),
124 ),
125 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
126 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
127 ),
128 PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
129 PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
130 ),
131 PostingFormat::Roaring => CompressedPostingList::Roaring(
132 RoaringPostingList::from_postings(doc_ids, term_freqs),
133 ),
134 }
135 }
136
137 pub fn doc_count(&self) -> u32 {
139 match self {
140 CompressedPostingList::Bitpacked(p) => p.doc_count,
141 CompressedPostingList::SimdBp128(p) => p.doc_count,
142 CompressedPostingList::EliasFano(p) => p.len(),
143 CompressedPostingList::PartitionedEF(p) => p.len(),
144 CompressedPostingList::Roaring(p) => p.len(),
145 }
146 }
147
148 pub fn max_tf(&self) -> u32 {
150 match self {
151 CompressedPostingList::Bitpacked(p) => p.max_score as u32, CompressedPostingList::SimdBp128(p) => {
153 p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
154 }
155 CompressedPostingList::EliasFano(p) => p.max_tf,
156 CompressedPostingList::PartitionedEF(p) => p.max_tf,
157 CompressedPostingList::Roaring(p) => p.max_tf,
158 }
159 }
160
161 pub fn format(&self) -> PostingFormat {
163 match self {
164 CompressedPostingList::Bitpacked(_) => PostingFormat::Bitpacked,
165 CompressedPostingList::SimdBp128(_) => PostingFormat::SimdBp128,
166 CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
167 CompressedPostingList::PartitionedEF(_) => PostingFormat::PartitionedEF,
168 CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
169 }
170 }
171
172 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
174 match self {
175 CompressedPostingList::Bitpacked(p) => {
176 writer.write_u8(FORMAT_BITPACKED)?;
177 p.serialize(writer)
178 }
179 CompressedPostingList::SimdBp128(p) => {
180 writer.write_u8(FORMAT_SIMD_BP128)?;
181 p.serialize(writer)
182 }
183 CompressedPostingList::EliasFano(p) => {
184 writer.write_u8(FORMAT_ELIAS_FANO)?;
185 p.serialize(writer)
186 }
187 CompressedPostingList::PartitionedEF(p) => {
188 writer.write_u8(FORMAT_PARTITIONED_EF)?;
189 p.serialize(writer)
190 }
191 CompressedPostingList::Roaring(p) => {
192 writer.write_u8(FORMAT_ROARING)?;
193 p.serialize(writer)
194 }
195 }
196 }
197
198 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
200 let format = reader.read_u8()?;
201 match format {
202 FORMAT_BITPACKED => Ok(CompressedPostingList::Bitpacked(
203 BitpackedPostingList::deserialize(reader)?,
204 )),
205 FORMAT_SIMD_BP128 => Ok(CompressedPostingList::SimdBp128(
206 SimdBp128PostingList::deserialize(reader)?,
207 )),
208 FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
209 EliasFanoPostingList::deserialize(reader)?,
210 )),
211 FORMAT_PARTITIONED_EF => Ok(CompressedPostingList::PartitionedEF(
212 PartitionedEFPostingList::deserialize(reader)?,
213 )),
214 FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
215 RoaringPostingList::deserialize(reader)?,
216 )),
217 _ => Err(io::Error::new(
218 io::ErrorKind::InvalidData,
219 format!("Unknown posting list format: {}", format),
220 )),
221 }
222 }
223
224 pub fn iterator(&self) -> CompressedPostingIterator<'_> {
226 match self {
227 CompressedPostingList::Bitpacked(p) => {
228 CompressedPostingIterator::Bitpacked(p.iterator())
229 }
230 CompressedPostingList::SimdBp128(p) => {
231 CompressedPostingIterator::SimdBp128(p.iterator())
232 }
233 CompressedPostingList::EliasFano(p) => {
234 CompressedPostingIterator::EliasFano(p.iterator())
235 }
236 CompressedPostingList::PartitionedEF(p) => {
237 CompressedPostingIterator::PartitionedEF(p.iterator())
238 }
239 CompressedPostingList::Roaring(p) => {
240 let mut iter = p.iterator();
241 iter.init();
242 CompressedPostingIterator::Roaring(iter)
243 }
244 }
245 }
246}
247
248pub enum CompressedPostingIterator<'a> {
250 Bitpacked(super::bitpacking::BitpackedPostingIterator<'a>),
251 SimdBp128(super::simd_bp128::SimdBp128Iterator<'a>),
252 EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
253 PartitionedEF(super::partitioned_ef::PartitionedEFPostingIterator<'a>),
254 Roaring(super::roaring::RoaringPostingIterator<'a>),
255}
256
257impl<'a> CompressedPostingIterator<'a> {
258 pub fn doc(&self) -> u32 {
260 match self {
261 CompressedPostingIterator::Bitpacked(i) => i.doc(),
262 CompressedPostingIterator::SimdBp128(i) => i.doc(),
263 CompressedPostingIterator::EliasFano(i) => i.doc(),
264 CompressedPostingIterator::PartitionedEF(i) => i.doc(),
265 CompressedPostingIterator::Roaring(i) => i.doc(),
266 }
267 }
268
269 pub fn term_freq(&self) -> u32 {
271 match self {
272 CompressedPostingIterator::Bitpacked(i) => i.term_freq(),
273 CompressedPostingIterator::SimdBp128(i) => i.term_freq(),
274 CompressedPostingIterator::EliasFano(i) => i.term_freq(),
275 CompressedPostingIterator::PartitionedEF(i) => i.term_freq(),
276 CompressedPostingIterator::Roaring(i) => i.term_freq(),
277 }
278 }
279
280 pub fn advance(&mut self) -> u32 {
282 match self {
283 CompressedPostingIterator::Bitpacked(i) => i.advance(),
284 CompressedPostingIterator::SimdBp128(i) => i.advance(),
285 CompressedPostingIterator::EliasFano(i) => i.advance(),
286 CompressedPostingIterator::PartitionedEF(i) => i.advance(),
287 CompressedPostingIterator::Roaring(i) => i.advance(),
288 }
289 }
290
291 pub fn seek(&mut self, target: u32) -> u32 {
293 match self {
294 CompressedPostingIterator::Bitpacked(i) => i.seek(target),
295 CompressedPostingIterator::SimdBp128(i) => i.seek(target),
296 CompressedPostingIterator::EliasFano(i) => i.seek(target),
297 CompressedPostingIterator::PartitionedEF(i) => i.seek(target),
298 CompressedPostingIterator::Roaring(i) => i.seek(target),
299 }
300 }
301
302 pub fn is_exhausted(&self) -> bool {
304 self.doc() == u32::MAX
305 }
306}
307
308#[derive(Debug, Default, Clone)]
310pub struct CompressionStats {
311 pub bitpacked_count: u32,
312 pub bitpacked_docs: u64,
313 pub simd_bp128_count: u32,
314 pub simd_bp128_docs: u64,
315 pub elias_fano_count: u32,
316 pub elias_fano_docs: u64,
317 pub partitioned_ef_count: u32,
318 pub partitioned_ef_docs: u64,
319 pub roaring_count: u32,
320 pub roaring_docs: u64,
321 pub inline_count: u32,
322 pub inline_docs: u64,
323}
324
325impl CompressionStats {
326 pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
327 match format {
328 PostingFormat::Bitpacked => {
329 self.bitpacked_count += 1;
330 self.bitpacked_docs += doc_count as u64;
331 }
332 PostingFormat::SimdBp128 => {
333 self.simd_bp128_count += 1;
334 self.simd_bp128_docs += doc_count as u64;
335 }
336 PostingFormat::EliasFano => {
337 self.elias_fano_count += 1;
338 self.elias_fano_docs += doc_count as u64;
339 }
340 PostingFormat::PartitionedEF => {
341 self.partitioned_ef_count += 1;
342 self.partitioned_ef_docs += doc_count as u64;
343 }
344 PostingFormat::Roaring => {
345 self.roaring_count += 1;
346 self.roaring_docs += doc_count as u64;
347 }
348 }
349 }
350
351 pub fn record_inline(&mut self, doc_count: u32) {
352 self.inline_count += 1;
353 self.inline_docs += doc_count as u64;
354 }
355
356 pub fn total_terms(&self) -> u32 {
357 self.bitpacked_count
358 + self.simd_bp128_count
359 + self.elias_fano_count
360 + self.partitioned_ef_count
361 + self.roaring_count
362 + self.inline_count
363 }
364
365 pub fn total_postings(&self) -> u64 {
366 self.bitpacked_docs
367 + self.simd_bp128_docs
368 + self.elias_fano_docs
369 + self.partitioned_ef_docs
370 + self.roaring_docs
371 + self.inline_docs
372 }
373}
374
375#[cfg(test)]
376mod tests {
377 use super::*;
378
379 #[test]
380 fn test_format_selection() {
381 assert_eq!(
383 PostingFormat::select(100, 1_000_000),
384 PostingFormat::SimdBp128
385 );
386
387 assert_eq!(
389 PostingFormat::select(500, 1_000_000),
390 PostingFormat::SimdBp128
391 );
392
393 assert_eq!(
394 PostingFormat::select(5_000, 1_000_000),
395 PostingFormat::SimdBp128
396 );
397
398 assert_eq!(
399 PostingFormat::select(15_000, 10_000_000),
400 PostingFormat::SimdBp128
401 );
402
403 assert_eq!(
405 PostingFormat::select(25_000, 10_000_000),
406 PostingFormat::PartitionedEF
407 );
408
409 assert_eq!(
411 PostingFormat::select(50_000, 1_000_000),
412 PostingFormat::Roaring
413 );
414 }
415
416 #[test]
417 fn test_compressed_posting_list_small() {
418 let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
419 let term_freqs: Vec<u32> = vec![1; 100];
420
421 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
422
423 assert_eq!(list.format(), PostingFormat::SimdBp128);
425 assert_eq!(list.doc_count(), 100);
426
427 let mut iter = list.iterator();
428 for (i, &expected) in doc_ids.iter().enumerate() {
429 assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
430 iter.advance();
431 }
432 }
433
434 #[test]
435 fn test_compressed_posting_list_simd_bp128() {
436 let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
437 let term_freqs: Vec<u32> = vec![1; 15_000];
438
439 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
441
442 assert_eq!(list.format(), PostingFormat::SimdBp128);
443 assert_eq!(list.doc_count(), 15_000);
444 }
445
446 #[test]
447 fn test_compressed_posting_list_serialization() {
448 let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
449 let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
450
451 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
452
453 let mut buffer = Vec::new();
454 list.serialize(&mut buffer).unwrap();
455
456 let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
457
458 assert_eq!(restored.format(), list.format());
459 assert_eq!(restored.doc_count(), list.doc_count());
460
461 let mut iter1 = list.iterator();
463 let mut iter2 = restored.iterator();
464
465 while iter1.doc() != u32::MAX {
466 assert_eq!(iter1.doc(), iter2.doc());
467 assert_eq!(iter1.term_freq(), iter2.term_freq());
468 iter1.advance();
469 iter2.advance();
470 }
471 }
472
473 #[test]
474 fn test_iterator_seek() {
475 let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
476 let term_freqs: Vec<u32> = vec![1; 8];
477
478 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
479 let mut iter = list.iterator();
480
481 assert_eq!(iter.seek(25), 30);
482 assert_eq!(iter.seek(100), 100);
483 assert_eq!(iter.seek(500), 1000);
484 assert_eq!(iter.seek(3000), u32::MAX);
485 }
486}