1use byteorder::{ReadBytesExt, WriteBytesExt};
13use std::io::{self, Read, Write};
14
15use super::elias_fano::EliasFanoPostingList;
16use super::horizontal_bp128::HorizontalBP128PostingList;
17use super::opt_p4d::OptP4DPostingList;
18use super::partitioned_ef::PartitionedEFPostingList;
19use super::roaring::RoaringPostingList;
20use super::vertical_bp128::VerticalBP128PostingList;
21
22pub const INLINE_THRESHOLD: usize = 3;
24pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; pub const PARTITIONED_EF_THRESHOLD: usize = 20_000;
27
28const FORMAT_HORIZONTAL_BP128: u8 = 0;
30const FORMAT_ELIAS_FANO: u8 = 1;
31const FORMAT_ROARING: u8 = 2;
32const FORMAT_VERTICAL_BP128: u8 = 3;
33const FORMAT_PARTITIONED_EF: u8 = 4;
34const FORMAT_OPT_P4D: u8 = 5;
35
36#[derive(Debug, Clone, Copy, PartialEq, Eq)]
38pub enum PostingFormat {
39 HorizontalBP128,
41 VerticalBP128,
43 EliasFano,
45 PartitionedEF,
47 Roaring,
49 OptP4D,
51}
52
53impl PostingFormat {
54 pub fn select(doc_count: usize, total_docs: usize) -> Self {
66 let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
67
68 if frequency_ratio >= ROARING_THRESHOLD_RATIO && doc_count >= PARTITIONED_EF_THRESHOLD {
69 PostingFormat::Roaring
71 } else if doc_count >= PARTITIONED_EF_THRESHOLD {
72 PostingFormat::PartitionedEF
74 } else {
75 PostingFormat::HorizontalBP128
77 }
78 }
79}
80
81#[derive(Debug, Clone)]
83pub enum CompressedPostingList {
84 HorizontalBP128(HorizontalBP128PostingList),
85 VerticalBP128(VerticalBP128PostingList),
86 EliasFano(EliasFanoPostingList),
87 PartitionedEF(PartitionedEFPostingList),
88 Roaring(RoaringPostingList),
89 OptP4D(OptP4DPostingList),
90}
91
92impl CompressedPostingList {
93 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
95 let format = PostingFormat::select(doc_ids.len(), total_docs);
96
97 match format {
98 PostingFormat::HorizontalBP128 => CompressedPostingList::HorizontalBP128(
99 HorizontalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
100 ),
101 PostingFormat::VerticalBP128 => CompressedPostingList::VerticalBP128(
102 VerticalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
103 ),
104 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
105 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
106 ),
107 PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
108 PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
109 ),
110 PostingFormat::Roaring => CompressedPostingList::Roaring(
111 RoaringPostingList::from_postings(doc_ids, term_freqs),
112 ),
113 PostingFormat::OptP4D => CompressedPostingList::OptP4D(
114 OptP4DPostingList::from_postings(doc_ids, term_freqs, idf),
115 ),
116 }
117 }
118
119 pub fn from_postings_with_format(
121 doc_ids: &[u32],
122 term_freqs: &[u32],
123 format: PostingFormat,
124 idf: f32,
125 ) -> Self {
126 match format {
127 PostingFormat::HorizontalBP128 => CompressedPostingList::HorizontalBP128(
128 HorizontalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
129 ),
130 PostingFormat::VerticalBP128 => CompressedPostingList::VerticalBP128(
131 VerticalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
132 ),
133 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
134 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
135 ),
136 PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
137 PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
138 ),
139 PostingFormat::Roaring => CompressedPostingList::Roaring(
140 RoaringPostingList::from_postings(doc_ids, term_freqs),
141 ),
142 PostingFormat::OptP4D => CompressedPostingList::OptP4D(
143 OptP4DPostingList::from_postings(doc_ids, term_freqs, idf),
144 ),
145 }
146 }
147
148 pub fn doc_count(&self) -> u32 {
150 match self {
151 CompressedPostingList::HorizontalBP128(p) => p.doc_count,
152 CompressedPostingList::VerticalBP128(p) => p.doc_count,
153 CompressedPostingList::EliasFano(p) => p.len(),
154 CompressedPostingList::PartitionedEF(p) => p.len(),
155 CompressedPostingList::Roaring(p) => p.len(),
156 CompressedPostingList::OptP4D(p) => p.len(),
157 }
158 }
159
160 pub fn max_tf(&self) -> u32 {
162 match self {
163 CompressedPostingList::HorizontalBP128(p) => p.max_score as u32, CompressedPostingList::VerticalBP128(p) => {
165 p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
166 }
167 CompressedPostingList::EliasFano(p) => p.max_tf,
168 CompressedPostingList::PartitionedEF(p) => p.max_tf,
169 CompressedPostingList::Roaring(p) => p.max_tf,
170 CompressedPostingList::OptP4D(p) => {
171 p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
172 }
173 }
174 }
175
176 pub fn format(&self) -> PostingFormat {
178 match self {
179 CompressedPostingList::HorizontalBP128(_) => PostingFormat::HorizontalBP128,
180 CompressedPostingList::VerticalBP128(_) => PostingFormat::VerticalBP128,
181 CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
182 CompressedPostingList::PartitionedEF(_) => PostingFormat::PartitionedEF,
183 CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
184 CompressedPostingList::OptP4D(_) => PostingFormat::OptP4D,
185 }
186 }
187
188 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
190 match self {
191 CompressedPostingList::HorizontalBP128(p) => {
192 writer.write_u8(FORMAT_HORIZONTAL_BP128)?;
193 p.serialize(writer)
194 }
195 CompressedPostingList::VerticalBP128(p) => {
196 writer.write_u8(FORMAT_VERTICAL_BP128)?;
197 p.serialize(writer)
198 }
199 CompressedPostingList::EliasFano(p) => {
200 writer.write_u8(FORMAT_ELIAS_FANO)?;
201 p.serialize(writer)
202 }
203 CompressedPostingList::PartitionedEF(p) => {
204 writer.write_u8(FORMAT_PARTITIONED_EF)?;
205 p.serialize(writer)
206 }
207 CompressedPostingList::Roaring(p) => {
208 writer.write_u8(FORMAT_ROARING)?;
209 p.serialize(writer)
210 }
211 CompressedPostingList::OptP4D(p) => {
212 writer.write_u8(FORMAT_OPT_P4D)?;
213 p.serialize(writer)
214 }
215 }
216 }
217
218 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
220 let format = reader.read_u8()?;
221 match format {
222 FORMAT_HORIZONTAL_BP128 => Ok(CompressedPostingList::HorizontalBP128(
223 HorizontalBP128PostingList::deserialize(reader)?,
224 )),
225 FORMAT_VERTICAL_BP128 => Ok(CompressedPostingList::VerticalBP128(
226 VerticalBP128PostingList::deserialize(reader)?,
227 )),
228 FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
229 EliasFanoPostingList::deserialize(reader)?,
230 )),
231 FORMAT_PARTITIONED_EF => Ok(CompressedPostingList::PartitionedEF(
232 PartitionedEFPostingList::deserialize(reader)?,
233 )),
234 FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
235 RoaringPostingList::deserialize(reader)?,
236 )),
237 FORMAT_OPT_P4D => Ok(CompressedPostingList::OptP4D(
238 OptP4DPostingList::deserialize(reader)?,
239 )),
240 _ => Err(io::Error::new(
241 io::ErrorKind::InvalidData,
242 format!("Unknown posting list format: {}", format),
243 )),
244 }
245 }
246
247 pub fn iterator(&self) -> CompressedPostingIterator<'_> {
249 match self {
250 CompressedPostingList::HorizontalBP128(p) => {
251 CompressedPostingIterator::HorizontalBP128(p.iterator())
252 }
253 CompressedPostingList::VerticalBP128(p) => {
254 CompressedPostingIterator::VerticalBP128(p.iterator())
255 }
256 CompressedPostingList::EliasFano(p) => {
257 CompressedPostingIterator::EliasFano(p.iterator())
258 }
259 CompressedPostingList::PartitionedEF(p) => {
260 CompressedPostingIterator::PartitionedEF(p.iterator())
261 }
262 CompressedPostingList::Roaring(p) => {
263 let mut iter = p.iterator();
264 iter.init();
265 CompressedPostingIterator::Roaring(iter)
266 }
267 CompressedPostingList::OptP4D(p) => CompressedPostingIterator::OptP4D(p.iterator()),
268 }
269 }
270}
271
272pub enum CompressedPostingIterator<'a> {
274 HorizontalBP128(super::horizontal_bp128::HorizontalBP128Iterator<'a>),
275 VerticalBP128(super::vertical_bp128::VerticalBP128Iterator<'a>),
276 EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
277 PartitionedEF(super::partitioned_ef::PartitionedEFPostingIterator<'a>),
278 Roaring(super::roaring::RoaringPostingIterator<'a>),
279 OptP4D(super::opt_p4d::OptP4DIterator<'a>),
280}
281
282impl<'a> CompressedPostingIterator<'a> {
283 pub fn doc(&self) -> u32 {
285 match self {
286 CompressedPostingIterator::HorizontalBP128(i) => i.doc(),
287 CompressedPostingIterator::VerticalBP128(i) => i.doc(),
288 CompressedPostingIterator::EliasFano(i) => i.doc(),
289 CompressedPostingIterator::PartitionedEF(i) => i.doc(),
290 CompressedPostingIterator::Roaring(i) => i.doc(),
291 CompressedPostingIterator::OptP4D(i) => i.doc(),
292 }
293 }
294
295 pub fn term_freq(&self) -> u32 {
297 match self {
298 CompressedPostingIterator::HorizontalBP128(i) => i.term_freq(),
299 CompressedPostingIterator::VerticalBP128(i) => i.term_freq(),
300 CompressedPostingIterator::EliasFano(i) => i.term_freq(),
301 CompressedPostingIterator::PartitionedEF(i) => i.term_freq(),
302 CompressedPostingIterator::Roaring(i) => i.term_freq(),
303 CompressedPostingIterator::OptP4D(i) => i.term_freq(),
304 }
305 }
306
307 pub fn advance(&mut self) -> u32 {
309 match self {
310 CompressedPostingIterator::HorizontalBP128(i) => i.advance(),
311 CompressedPostingIterator::VerticalBP128(i) => i.advance(),
312 CompressedPostingIterator::EliasFano(i) => i.advance(),
313 CompressedPostingIterator::PartitionedEF(i) => i.advance(),
314 CompressedPostingIterator::Roaring(i) => i.advance(),
315 CompressedPostingIterator::OptP4D(i) => i.advance(),
316 }
317 }
318
319 pub fn seek(&mut self, target: u32) -> u32 {
321 match self {
322 CompressedPostingIterator::HorizontalBP128(i) => i.seek(target),
323 CompressedPostingIterator::VerticalBP128(i) => i.seek(target),
324 CompressedPostingIterator::EliasFano(i) => i.seek(target),
325 CompressedPostingIterator::PartitionedEF(i) => i.seek(target),
326 CompressedPostingIterator::Roaring(i) => i.seek(target),
327 CompressedPostingIterator::OptP4D(i) => i.seek(target),
328 }
329 }
330
331 pub fn is_exhausted(&self) -> bool {
333 self.doc() == u32::MAX
334 }
335}
336
337#[derive(Debug, Default, Clone)]
339pub struct CompressionStats {
340 pub bitpacked_count: u32,
341 pub bitpacked_docs: u64,
342 pub simd_bp128_count: u32,
343 pub simd_bp128_docs: u64,
344 pub elias_fano_count: u32,
345 pub elias_fano_docs: u64,
346 pub partitioned_ef_count: u32,
347 pub partitioned_ef_docs: u64,
348 pub roaring_count: u32,
349 pub roaring_docs: u64,
350 pub inline_count: u32,
351 pub inline_docs: u64,
352}
353
354impl CompressionStats {
355 pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
356 match format {
357 PostingFormat::HorizontalBP128 => {
358 self.bitpacked_count += 1;
359 self.bitpacked_docs += doc_count as u64;
360 }
361 PostingFormat::VerticalBP128 => {
362 self.simd_bp128_count += 1;
363 self.simd_bp128_docs += doc_count as u64;
364 }
365 PostingFormat::EliasFano => {
366 self.elias_fano_count += 1;
367 self.elias_fano_docs += doc_count as u64;
368 }
369 PostingFormat::PartitionedEF => {
370 self.partitioned_ef_count += 1;
371 self.partitioned_ef_docs += doc_count as u64;
372 }
373 PostingFormat::Roaring => {
374 self.roaring_count += 1;
375 self.roaring_docs += doc_count as u64;
376 }
377 PostingFormat::OptP4D => {
378 self.bitpacked_count += 1;
380 self.bitpacked_docs += doc_count as u64;
381 }
382 }
383 }
384
385 pub fn record_inline(&mut self, doc_count: u32) {
386 self.inline_count += 1;
387 self.inline_docs += doc_count as u64;
388 }
389
390 pub fn total_terms(&self) -> u32 {
391 self.bitpacked_count
392 + self.simd_bp128_count
393 + self.elias_fano_count
394 + self.partitioned_ef_count
395 + self.roaring_count
396 + self.inline_count
397 }
398
399 pub fn total_postings(&self) -> u64 {
400 self.bitpacked_docs
401 + self.simd_bp128_docs
402 + self.elias_fano_docs
403 + self.partitioned_ef_docs
404 + self.roaring_docs
405 + self.inline_docs
406 }
407}
408
409#[cfg(test)]
410mod tests {
411 use super::*;
412
413 #[test]
414 fn test_format_selection() {
415 assert_eq!(
417 PostingFormat::select(100, 1_000_000),
418 PostingFormat::HorizontalBP128
419 );
420
421 assert_eq!(
423 PostingFormat::select(500, 1_000_000),
424 PostingFormat::HorizontalBP128
425 );
426
427 assert_eq!(
428 PostingFormat::select(5_000, 1_000_000),
429 PostingFormat::HorizontalBP128
430 );
431
432 assert_eq!(
433 PostingFormat::select(15_000, 10_000_000),
434 PostingFormat::HorizontalBP128
435 );
436
437 assert_eq!(
439 PostingFormat::select(25_000, 10_000_000),
440 PostingFormat::PartitionedEF
441 );
442
443 assert_eq!(
445 PostingFormat::select(50_000, 1_000_000),
446 PostingFormat::Roaring
447 );
448 }
449
450 #[test]
451 fn test_compressed_posting_list_small() {
452 let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
453 let term_freqs: Vec<u32> = vec![1; 100];
454
455 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
456
457 assert_eq!(list.format(), PostingFormat::HorizontalBP128);
459 assert_eq!(list.doc_count(), 100);
460
461 let mut iter = list.iterator();
462 for (i, &expected) in doc_ids.iter().enumerate() {
463 assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
464 iter.advance();
465 }
466 }
467
468 #[test]
469 fn test_compressed_posting_list_bitpacked() {
470 let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
471 let term_freqs: Vec<u32> = vec![1; 15_000];
472
473 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
475
476 assert_eq!(list.format(), PostingFormat::HorizontalBP128);
477 assert_eq!(list.doc_count(), 15_000);
478 }
479
480 #[test]
481 fn test_compressed_posting_list_serialization() {
482 let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
483 let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
484
485 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
486
487 let mut buffer = Vec::new();
488 list.serialize(&mut buffer).unwrap();
489
490 let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
491
492 assert_eq!(restored.format(), list.format());
493 assert_eq!(restored.doc_count(), list.doc_count());
494
495 let mut iter1 = list.iterator();
497 let mut iter2 = restored.iterator();
498
499 while iter1.doc() != u32::MAX {
500 assert_eq!(iter1.doc(), iter2.doc());
501 assert_eq!(iter1.term_freq(), iter2.term_freq());
502 iter1.advance();
503 iter2.advance();
504 }
505 }
506
507 #[test]
508 fn test_iterator_seek() {
509 let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
510 let term_freqs: Vec<u32> = vec![1; 8];
511
512 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
513 let mut iter = list.iterator();
514
515 assert_eq!(iter.seek(25), 30);
516 assert_eq!(iter.seek(100), 100);
517 assert_eq!(iter.seek(500), 1000);
518 assert_eq!(iter.seek(3000), u32::MAX);
519 }
520
521 #[test]
522 fn test_opt_p4d_via_unified_interface() {
523 let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
524 let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
525
526 let list = CompressedPostingList::from_postings_with_format(
528 &doc_ids,
529 &term_freqs,
530 PostingFormat::OptP4D,
531 1.0,
532 );
533
534 assert_eq!(list.format(), PostingFormat::OptP4D);
535 assert_eq!(list.doc_count(), 500);
536
537 let mut buffer = Vec::new();
539 list.serialize(&mut buffer).unwrap();
540 let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
541
542 assert_eq!(restored.format(), PostingFormat::OptP4D);
543 assert_eq!(restored.doc_count(), 500);
544
545 let mut iter1 = list.iterator();
547 let mut iter2 = restored.iterator();
548
549 while iter1.doc() != u32::MAX {
550 assert_eq!(iter1.doc(), iter2.doc());
551 assert_eq!(iter1.term_freq(), iter2.term_freq());
552 iter1.advance();
553 iter2.advance();
554 }
555
556 let mut iter = list.iterator();
558 assert_eq!(iter.seek(100), 102); assert_eq!(iter.seek(500), 501); }
561}