1use byteorder::{ReadBytesExt, WriteBytesExt};
15use std::io::{self, Read, Write};
16
17use super::elias_fano::EliasFanoPostingList;
18use super::horizontal_bp128::HorizontalBP128PostingList;
19use super::opt_p4d::OptP4DPostingList;
20use super::partitioned_ef::PartitionedEFPostingList;
21use super::roaring::RoaringPostingList;
22use super::vertical_bp128::VerticalBP128PostingList;
23
24pub const INLINE_THRESHOLD: usize = 3;
26pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; pub const PARTITIONED_EF_THRESHOLD: usize = 20_000;
29
30#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
32pub enum IndexOptimization {
33 #[default]
36 Adaptive,
37 SizeOptimized,
40 PerformanceOptimized,
43}
44
45impl IndexOptimization {
46 pub fn zstd_level(&self) -> i32 {
48 match self {
49 IndexOptimization::Adaptive => 7,
50 IndexOptimization::SizeOptimized => 22,
51 IndexOptimization::PerformanceOptimized => 3,
52 }
53 }
54
55 pub fn parse(s: &str) -> Option<Self> {
57 match s.to_lowercase().as_str() {
58 "adaptive" | "balanced" | "default" => Some(IndexOptimization::Adaptive),
59 "size" | "size-optimized" | "small" | "compact" => {
60 Some(IndexOptimization::SizeOptimized)
61 }
62 "performance" | "perf" | "fast" | "speed" => {
63 Some(IndexOptimization::PerformanceOptimized)
64 }
65 _ => None,
66 }
67 }
68}
69
70const FORMAT_HORIZONTAL_BP128: u8 = 0;
72const FORMAT_ELIAS_FANO: u8 = 1;
73const FORMAT_ROARING: u8 = 2;
74const FORMAT_VERTICAL_BP128: u8 = 3;
75const FORMAT_PARTITIONED_EF: u8 = 4;
76const FORMAT_OPT_P4D: u8 = 5;
77
78#[derive(Debug, Clone, Copy, PartialEq, Eq)]
80pub enum PostingFormat {
81 HorizontalBP128,
83 VerticalBP128,
85 EliasFano,
87 PartitionedEF,
89 Roaring,
91 OptP4D,
93}
94
95impl PostingFormat {
96 pub fn select(doc_count: usize, total_docs: usize) -> Self {
103 Self::select_with_optimization(doc_count, total_docs, IndexOptimization::Adaptive)
104 }
105
106 pub fn select_with_optimization(
108 doc_count: usize,
109 total_docs: usize,
110 optimization: IndexOptimization,
111 ) -> Self {
112 let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
113
114 match optimization {
115 IndexOptimization::Adaptive => {
116 if frequency_ratio >= ROARING_THRESHOLD_RATIO
118 && doc_count >= PARTITIONED_EF_THRESHOLD
119 {
120 PostingFormat::Roaring
121 } else if doc_count >= PARTITIONED_EF_THRESHOLD {
122 PostingFormat::PartitionedEF
123 } else {
124 PostingFormat::HorizontalBP128
125 }
126 }
127 IndexOptimization::SizeOptimized => {
128 if doc_count >= 128 {
130 PostingFormat::OptP4D
131 } else {
132 PostingFormat::HorizontalBP128
133 }
134 }
135 IndexOptimization::PerformanceOptimized => {
136 if doc_count >= 64 {
138 PostingFormat::Roaring
139 } else {
140 PostingFormat::HorizontalBP128
141 }
142 }
143 }
144 }
145}
146
147#[derive(Debug, Clone)]
149pub enum CompressedPostingList {
150 HorizontalBP128(HorizontalBP128PostingList),
151 VerticalBP128(VerticalBP128PostingList),
152 EliasFano(EliasFanoPostingList),
153 PartitionedEF(PartitionedEFPostingList),
154 Roaring(RoaringPostingList),
155 OptP4D(OptP4DPostingList),
156}
157
158impl CompressedPostingList {
159 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
161 let format = PostingFormat::select(doc_ids.len(), total_docs);
162
163 match format {
164 PostingFormat::HorizontalBP128 => CompressedPostingList::HorizontalBP128(
165 HorizontalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
166 ),
167 PostingFormat::VerticalBP128 => CompressedPostingList::VerticalBP128(
168 VerticalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
169 ),
170 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
171 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
172 ),
173 PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
174 PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
175 ),
176 PostingFormat::Roaring => CompressedPostingList::Roaring(
177 RoaringPostingList::from_postings(doc_ids, term_freqs),
178 ),
179 PostingFormat::OptP4D => CompressedPostingList::OptP4D(
180 OptP4DPostingList::from_postings(doc_ids, term_freqs, idf),
181 ),
182 }
183 }
184
185 pub fn from_postings_with_format(
187 doc_ids: &[u32],
188 term_freqs: &[u32],
189 format: PostingFormat,
190 idf: f32,
191 ) -> Self {
192 match format {
193 PostingFormat::HorizontalBP128 => CompressedPostingList::HorizontalBP128(
194 HorizontalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
195 ),
196 PostingFormat::VerticalBP128 => CompressedPostingList::VerticalBP128(
197 VerticalBP128PostingList::from_postings(doc_ids, term_freqs, idf),
198 ),
199 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
200 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
201 ),
202 PostingFormat::PartitionedEF => CompressedPostingList::PartitionedEF(
203 PartitionedEFPostingList::from_postings_with_idf(doc_ids, term_freqs, idf),
204 ),
205 PostingFormat::Roaring => CompressedPostingList::Roaring(
206 RoaringPostingList::from_postings(doc_ids, term_freqs),
207 ),
208 PostingFormat::OptP4D => CompressedPostingList::OptP4D(
209 OptP4DPostingList::from_postings(doc_ids, term_freqs, idf),
210 ),
211 }
212 }
213
214 pub fn doc_count(&self) -> u32 {
216 match self {
217 CompressedPostingList::HorizontalBP128(p) => p.doc_count,
218 CompressedPostingList::VerticalBP128(p) => p.doc_count,
219 CompressedPostingList::EliasFano(p) => p.len(),
220 CompressedPostingList::PartitionedEF(p) => p.len(),
221 CompressedPostingList::Roaring(p) => p.len(),
222 CompressedPostingList::OptP4D(p) => p.len(),
223 }
224 }
225
226 pub fn max_tf(&self) -> u32 {
228 match self {
229 CompressedPostingList::HorizontalBP128(p) => p.max_score as u32, CompressedPostingList::VerticalBP128(p) => {
231 p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
232 }
233 CompressedPostingList::EliasFano(p) => p.max_tf,
234 CompressedPostingList::PartitionedEF(p) => p.max_tf,
235 CompressedPostingList::Roaring(p) => p.max_tf,
236 CompressedPostingList::OptP4D(p) => {
237 p.blocks.iter().map(|b| b.max_tf).max().unwrap_or(0)
238 }
239 }
240 }
241
242 pub fn format(&self) -> PostingFormat {
244 match self {
245 CompressedPostingList::HorizontalBP128(_) => PostingFormat::HorizontalBP128,
246 CompressedPostingList::VerticalBP128(_) => PostingFormat::VerticalBP128,
247 CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
248 CompressedPostingList::PartitionedEF(_) => PostingFormat::PartitionedEF,
249 CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
250 CompressedPostingList::OptP4D(_) => PostingFormat::OptP4D,
251 }
252 }
253
254 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
256 match self {
257 CompressedPostingList::HorizontalBP128(p) => {
258 writer.write_u8(FORMAT_HORIZONTAL_BP128)?;
259 p.serialize(writer)
260 }
261 CompressedPostingList::VerticalBP128(p) => {
262 writer.write_u8(FORMAT_VERTICAL_BP128)?;
263 p.serialize(writer)
264 }
265 CompressedPostingList::EliasFano(p) => {
266 writer.write_u8(FORMAT_ELIAS_FANO)?;
267 p.serialize(writer)
268 }
269 CompressedPostingList::PartitionedEF(p) => {
270 writer.write_u8(FORMAT_PARTITIONED_EF)?;
271 p.serialize(writer)
272 }
273 CompressedPostingList::Roaring(p) => {
274 writer.write_u8(FORMAT_ROARING)?;
275 p.serialize(writer)
276 }
277 CompressedPostingList::OptP4D(p) => {
278 writer.write_u8(FORMAT_OPT_P4D)?;
279 p.serialize(writer)
280 }
281 }
282 }
283
284 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
286 let format = reader.read_u8()?;
287 match format {
288 FORMAT_HORIZONTAL_BP128 => Ok(CompressedPostingList::HorizontalBP128(
289 HorizontalBP128PostingList::deserialize(reader)?,
290 )),
291 FORMAT_VERTICAL_BP128 => Ok(CompressedPostingList::VerticalBP128(
292 VerticalBP128PostingList::deserialize(reader)?,
293 )),
294 FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
295 EliasFanoPostingList::deserialize(reader)?,
296 )),
297 FORMAT_PARTITIONED_EF => Ok(CompressedPostingList::PartitionedEF(
298 PartitionedEFPostingList::deserialize(reader)?,
299 )),
300 FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
301 RoaringPostingList::deserialize(reader)?,
302 )),
303 FORMAT_OPT_P4D => Ok(CompressedPostingList::OptP4D(
304 OptP4DPostingList::deserialize(reader)?,
305 )),
306 _ => Err(io::Error::new(
307 io::ErrorKind::InvalidData,
308 format!("Unknown posting list format: {}", format),
309 )),
310 }
311 }
312
313 pub fn iterator(&self) -> CompressedPostingIterator<'_> {
315 match self {
316 CompressedPostingList::HorizontalBP128(p) => {
317 CompressedPostingIterator::HorizontalBP128(p.iterator())
318 }
319 CompressedPostingList::VerticalBP128(p) => {
320 CompressedPostingIterator::VerticalBP128(p.iterator())
321 }
322 CompressedPostingList::EliasFano(p) => {
323 CompressedPostingIterator::EliasFano(p.iterator())
324 }
325 CompressedPostingList::PartitionedEF(p) => {
326 CompressedPostingIterator::PartitionedEF(p.iterator())
327 }
328 CompressedPostingList::Roaring(p) => {
329 let mut iter = p.iterator();
330 iter.init();
331 CompressedPostingIterator::Roaring(iter)
332 }
333 CompressedPostingList::OptP4D(p) => CompressedPostingIterator::OptP4D(p.iterator()),
334 }
335 }
336}
337
338pub enum CompressedPostingIterator<'a> {
340 HorizontalBP128(super::horizontal_bp128::HorizontalBP128Iterator<'a>),
341 VerticalBP128(super::vertical_bp128::VerticalBP128Iterator<'a>),
342 EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
343 PartitionedEF(super::partitioned_ef::PartitionedEFPostingIterator<'a>),
344 Roaring(super::roaring::RoaringPostingIterator<'a>),
345 OptP4D(super::opt_p4d::OptP4DIterator<'a>),
346}
347
348impl<'a> CompressedPostingIterator<'a> {
349 pub fn doc(&self) -> u32 {
351 match self {
352 CompressedPostingIterator::HorizontalBP128(i) => i.doc(),
353 CompressedPostingIterator::VerticalBP128(i) => i.doc(),
354 CompressedPostingIterator::EliasFano(i) => i.doc(),
355 CompressedPostingIterator::PartitionedEF(i) => i.doc(),
356 CompressedPostingIterator::Roaring(i) => i.doc(),
357 CompressedPostingIterator::OptP4D(i) => i.doc(),
358 }
359 }
360
361 pub fn term_freq(&self) -> u32 {
363 match self {
364 CompressedPostingIterator::HorizontalBP128(i) => i.term_freq(),
365 CompressedPostingIterator::VerticalBP128(i) => i.term_freq(),
366 CompressedPostingIterator::EliasFano(i) => i.term_freq(),
367 CompressedPostingIterator::PartitionedEF(i) => i.term_freq(),
368 CompressedPostingIterator::Roaring(i) => i.term_freq(),
369 CompressedPostingIterator::OptP4D(i) => i.term_freq(),
370 }
371 }
372
373 pub fn advance(&mut self) -> u32 {
375 match self {
376 CompressedPostingIterator::HorizontalBP128(i) => i.advance(),
377 CompressedPostingIterator::VerticalBP128(i) => i.advance(),
378 CompressedPostingIterator::EliasFano(i) => i.advance(),
379 CompressedPostingIterator::PartitionedEF(i) => i.advance(),
380 CompressedPostingIterator::Roaring(i) => i.advance(),
381 CompressedPostingIterator::OptP4D(i) => i.advance(),
382 }
383 }
384
385 pub fn seek(&mut self, target: u32) -> u32 {
387 match self {
388 CompressedPostingIterator::HorizontalBP128(i) => i.seek(target),
389 CompressedPostingIterator::VerticalBP128(i) => i.seek(target),
390 CompressedPostingIterator::EliasFano(i) => i.seek(target),
391 CompressedPostingIterator::PartitionedEF(i) => i.seek(target),
392 CompressedPostingIterator::Roaring(i) => i.seek(target),
393 CompressedPostingIterator::OptP4D(i) => i.seek(target),
394 }
395 }
396
397 pub fn is_exhausted(&self) -> bool {
399 self.doc() == u32::MAX
400 }
401}
402
403#[derive(Debug, Default, Clone)]
405pub struct CompressionStats {
406 pub bitpacked_count: u32,
407 pub bitpacked_docs: u64,
408 pub simd_bp128_count: u32,
409 pub simd_bp128_docs: u64,
410 pub elias_fano_count: u32,
411 pub elias_fano_docs: u64,
412 pub partitioned_ef_count: u32,
413 pub partitioned_ef_docs: u64,
414 pub roaring_count: u32,
415 pub roaring_docs: u64,
416 pub inline_count: u32,
417 pub inline_docs: u64,
418}
419
420impl CompressionStats {
421 pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
422 match format {
423 PostingFormat::HorizontalBP128 => {
424 self.bitpacked_count += 1;
425 self.bitpacked_docs += doc_count as u64;
426 }
427 PostingFormat::VerticalBP128 => {
428 self.simd_bp128_count += 1;
429 self.simd_bp128_docs += doc_count as u64;
430 }
431 PostingFormat::EliasFano => {
432 self.elias_fano_count += 1;
433 self.elias_fano_docs += doc_count as u64;
434 }
435 PostingFormat::PartitionedEF => {
436 self.partitioned_ef_count += 1;
437 self.partitioned_ef_docs += doc_count as u64;
438 }
439 PostingFormat::Roaring => {
440 self.roaring_count += 1;
441 self.roaring_docs += doc_count as u64;
442 }
443 PostingFormat::OptP4D => {
444 self.bitpacked_count += 1;
446 self.bitpacked_docs += doc_count as u64;
447 }
448 }
449 }
450
451 pub fn record_inline(&mut self, doc_count: u32) {
452 self.inline_count += 1;
453 self.inline_docs += doc_count as u64;
454 }
455
456 pub fn total_terms(&self) -> u32 {
457 self.bitpacked_count
458 + self.simd_bp128_count
459 + self.elias_fano_count
460 + self.partitioned_ef_count
461 + self.roaring_count
462 + self.inline_count
463 }
464
465 pub fn total_postings(&self) -> u64 {
466 self.bitpacked_docs
467 + self.simd_bp128_docs
468 + self.elias_fano_docs
469 + self.partitioned_ef_docs
470 + self.roaring_docs
471 + self.inline_docs
472 }
473}
474
475#[cfg(test)]
476mod tests {
477 use super::*;
478
479 #[test]
480 fn test_format_selection() {
481 assert_eq!(
483 PostingFormat::select(100, 1_000_000),
484 PostingFormat::HorizontalBP128
485 );
486
487 assert_eq!(
489 PostingFormat::select(500, 1_000_000),
490 PostingFormat::HorizontalBP128
491 );
492
493 assert_eq!(
494 PostingFormat::select(5_000, 1_000_000),
495 PostingFormat::HorizontalBP128
496 );
497
498 assert_eq!(
499 PostingFormat::select(15_000, 10_000_000),
500 PostingFormat::HorizontalBP128
501 );
502
503 assert_eq!(
505 PostingFormat::select(25_000, 10_000_000),
506 PostingFormat::PartitionedEF
507 );
508
509 assert_eq!(
511 PostingFormat::select(50_000, 1_000_000),
512 PostingFormat::Roaring
513 );
514 }
515
516 #[test]
517 fn test_compressed_posting_list_small() {
518 let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
519 let term_freqs: Vec<u32> = vec![1; 100];
520
521 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
522
523 assert_eq!(list.format(), PostingFormat::HorizontalBP128);
525 assert_eq!(list.doc_count(), 100);
526
527 let mut iter = list.iterator();
528 for (i, &expected) in doc_ids.iter().enumerate() {
529 assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
530 iter.advance();
531 }
532 }
533
534 #[test]
535 fn test_compressed_posting_list_bitpacked() {
536 let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
537 let term_freqs: Vec<u32> = vec![1; 15_000];
538
539 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
541
542 assert_eq!(list.format(), PostingFormat::HorizontalBP128);
543 assert_eq!(list.doc_count(), 15_000);
544 }
545
546 #[test]
547 fn test_compressed_posting_list_serialization() {
548 let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
549 let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
550
551 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
552
553 let mut buffer = Vec::new();
554 list.serialize(&mut buffer).unwrap();
555
556 let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
557
558 assert_eq!(restored.format(), list.format());
559 assert_eq!(restored.doc_count(), list.doc_count());
560
561 let mut iter1 = list.iterator();
563 let mut iter2 = restored.iterator();
564
565 while iter1.doc() != u32::MAX {
566 assert_eq!(iter1.doc(), iter2.doc());
567 assert_eq!(iter1.term_freq(), iter2.term_freq());
568 iter1.advance();
569 iter2.advance();
570 }
571 }
572
573 #[test]
574 fn test_iterator_seek() {
575 let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
576 let term_freqs: Vec<u32> = vec![1; 8];
577
578 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
579 let mut iter = list.iterator();
580
581 assert_eq!(iter.seek(25), 30);
582 assert_eq!(iter.seek(100), 100);
583 assert_eq!(iter.seek(500), 1000);
584 assert_eq!(iter.seek(3000), u32::MAX);
585 }
586
587 #[test]
588 fn test_opt_p4d_via_unified_interface() {
589 let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
590 let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
591
592 let list = CompressedPostingList::from_postings_with_format(
594 &doc_ids,
595 &term_freqs,
596 PostingFormat::OptP4D,
597 1.0,
598 );
599
600 assert_eq!(list.format(), PostingFormat::OptP4D);
601 assert_eq!(list.doc_count(), 500);
602
603 let mut buffer = Vec::new();
605 list.serialize(&mut buffer).unwrap();
606 let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
607
608 assert_eq!(restored.format(), PostingFormat::OptP4D);
609 assert_eq!(restored.doc_count(), 500);
610
611 let mut iter1 = list.iterator();
613 let mut iter2 = restored.iterator();
614
615 while iter1.doc() != u32::MAX {
616 assert_eq!(iter1.doc(), iter2.doc());
617 assert_eq!(iter1.term_freq(), iter2.term_freq());
618 iter1.advance();
619 iter2.advance();
620 }
621
622 let mut iter = list.iterator();
624 assert_eq!(iter.seek(100), 102); assert_eq!(iter.seek(500), 501); }
627}