1use byteorder::{ReadBytesExt, WriteBytesExt};
10use std::io::{self, Read, Write};
11
12use super::bitpacking::BitpackedPostingList;
13use super::elias_fano::EliasFanoPostingList;
14use super::roaring::RoaringPostingList;
15
16pub const INLINE_THRESHOLD: usize = 3;
18pub const ELIAS_FANO_THRESHOLD: usize = 10_000;
19pub const ROARING_THRESHOLD_RATIO: f32 = 0.01; const FORMAT_BITPACKED: u8 = 0;
23const FORMAT_ELIAS_FANO: u8 = 1;
24const FORMAT_ROARING: u8 = 2;
25
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub enum PostingFormat {
29 Bitpacked,
31 EliasFano,
33 Roaring,
35}
36
37impl PostingFormat {
38 pub fn select(doc_count: usize, total_docs: usize) -> Self {
40 let frequency_ratio = doc_count as f32 / total_docs.max(1) as f32;
41
42 if frequency_ratio >= ROARING_THRESHOLD_RATIO && doc_count > ELIAS_FANO_THRESHOLD {
43 PostingFormat::Roaring
45 } else if doc_count >= ELIAS_FANO_THRESHOLD {
46 PostingFormat::EliasFano
48 } else {
49 PostingFormat::Bitpacked
51 }
52 }
53}
54
55#[derive(Debug, Clone)]
57pub enum CompressedPostingList {
58 Bitpacked(BitpackedPostingList),
59 EliasFano(EliasFanoPostingList),
60 Roaring(RoaringPostingList),
61}
62
63impl CompressedPostingList {
64 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32], total_docs: usize, idf: f32) -> Self {
66 let format = PostingFormat::select(doc_ids.len(), total_docs);
67
68 match format {
69 PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
70 BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
71 ),
72 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
73 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
74 ),
75 PostingFormat::Roaring => CompressedPostingList::Roaring(
76 RoaringPostingList::from_postings(doc_ids, term_freqs),
77 ),
78 }
79 }
80
81 pub fn from_postings_with_format(
83 doc_ids: &[u32],
84 term_freqs: &[u32],
85 format: PostingFormat,
86 idf: f32,
87 ) -> Self {
88 match format {
89 PostingFormat::Bitpacked => CompressedPostingList::Bitpacked(
90 BitpackedPostingList::from_postings(doc_ids, term_freqs, idf),
91 ),
92 PostingFormat::EliasFano => CompressedPostingList::EliasFano(
93 EliasFanoPostingList::from_postings(doc_ids, term_freqs),
94 ),
95 PostingFormat::Roaring => CompressedPostingList::Roaring(
96 RoaringPostingList::from_postings(doc_ids, term_freqs),
97 ),
98 }
99 }
100
101 pub fn doc_count(&self) -> u32 {
103 match self {
104 CompressedPostingList::Bitpacked(p) => p.doc_count,
105 CompressedPostingList::EliasFano(p) => p.len(),
106 CompressedPostingList::Roaring(p) => p.len(),
107 }
108 }
109
110 pub fn max_tf(&self) -> u32 {
112 match self {
113 CompressedPostingList::Bitpacked(p) => p.max_score as u32, CompressedPostingList::EliasFano(p) => p.max_tf,
115 CompressedPostingList::Roaring(p) => p.max_tf,
116 }
117 }
118
119 pub fn format(&self) -> PostingFormat {
121 match self {
122 CompressedPostingList::Bitpacked(_) => PostingFormat::Bitpacked,
123 CompressedPostingList::EliasFano(_) => PostingFormat::EliasFano,
124 CompressedPostingList::Roaring(_) => PostingFormat::Roaring,
125 }
126 }
127
128 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
130 match self {
131 CompressedPostingList::Bitpacked(p) => {
132 writer.write_u8(FORMAT_BITPACKED)?;
133 p.serialize(writer)
134 }
135 CompressedPostingList::EliasFano(p) => {
136 writer.write_u8(FORMAT_ELIAS_FANO)?;
137 p.serialize(writer)
138 }
139 CompressedPostingList::Roaring(p) => {
140 writer.write_u8(FORMAT_ROARING)?;
141 p.serialize(writer)
142 }
143 }
144 }
145
146 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
148 let format = reader.read_u8()?;
149 match format {
150 FORMAT_BITPACKED => Ok(CompressedPostingList::Bitpacked(
151 BitpackedPostingList::deserialize(reader)?,
152 )),
153 FORMAT_ELIAS_FANO => Ok(CompressedPostingList::EliasFano(
154 EliasFanoPostingList::deserialize(reader)?,
155 )),
156 FORMAT_ROARING => Ok(CompressedPostingList::Roaring(
157 RoaringPostingList::deserialize(reader)?,
158 )),
159 _ => Err(io::Error::new(
160 io::ErrorKind::InvalidData,
161 format!("Unknown posting list format: {}", format),
162 )),
163 }
164 }
165
166 pub fn iterator(&self) -> CompressedPostingIterator<'_> {
168 match self {
169 CompressedPostingList::Bitpacked(p) => {
170 CompressedPostingIterator::Bitpacked(p.iterator())
171 }
172 CompressedPostingList::EliasFano(p) => {
173 CompressedPostingIterator::EliasFano(p.iterator())
174 }
175 CompressedPostingList::Roaring(p) => {
176 let mut iter = p.iterator();
177 iter.init();
178 CompressedPostingIterator::Roaring(iter)
179 }
180 }
181 }
182}
183
184pub enum CompressedPostingIterator<'a> {
186 Bitpacked(super::bitpacking::BitpackedPostingIterator<'a>),
187 EliasFano(super::elias_fano::EliasFanoPostingIterator<'a>),
188 Roaring(super::roaring::RoaringPostingIterator<'a>),
189}
190
191impl<'a> CompressedPostingIterator<'a> {
192 pub fn doc(&self) -> u32 {
194 match self {
195 CompressedPostingIterator::Bitpacked(i) => i.doc(),
196 CompressedPostingIterator::EliasFano(i) => i.doc(),
197 CompressedPostingIterator::Roaring(i) => i.doc(),
198 }
199 }
200
201 pub fn term_freq(&self) -> u32 {
203 match self {
204 CompressedPostingIterator::Bitpacked(i) => i.term_freq(),
205 CompressedPostingIterator::EliasFano(i) => i.term_freq(),
206 CompressedPostingIterator::Roaring(i) => i.term_freq(),
207 }
208 }
209
210 pub fn advance(&mut self) -> u32 {
212 match self {
213 CompressedPostingIterator::Bitpacked(i) => i.advance(),
214 CompressedPostingIterator::EliasFano(i) => i.advance(),
215 CompressedPostingIterator::Roaring(i) => i.advance(),
216 }
217 }
218
219 pub fn seek(&mut self, target: u32) -> u32 {
221 match self {
222 CompressedPostingIterator::Bitpacked(i) => i.seek(target),
223 CompressedPostingIterator::EliasFano(i) => i.seek(target),
224 CompressedPostingIterator::Roaring(i) => i.seek(target),
225 }
226 }
227
228 pub fn is_exhausted(&self) -> bool {
230 self.doc() == u32::MAX
231 }
232}
233
234#[derive(Debug, Default, Clone)]
236pub struct CompressionStats {
237 pub bitpacked_count: u32,
238 pub bitpacked_docs: u64,
239 pub elias_fano_count: u32,
240 pub elias_fano_docs: u64,
241 pub roaring_count: u32,
242 pub roaring_docs: u64,
243 pub inline_count: u32,
244 pub inline_docs: u64,
245}
246
247impl CompressionStats {
248 pub fn record(&mut self, format: PostingFormat, doc_count: u32) {
249 match format {
250 PostingFormat::Bitpacked => {
251 self.bitpacked_count += 1;
252 self.bitpacked_docs += doc_count as u64;
253 }
254 PostingFormat::EliasFano => {
255 self.elias_fano_count += 1;
256 self.elias_fano_docs += doc_count as u64;
257 }
258 PostingFormat::Roaring => {
259 self.roaring_count += 1;
260 self.roaring_docs += doc_count as u64;
261 }
262 }
263 }
264
265 pub fn record_inline(&mut self, doc_count: u32) {
266 self.inline_count += 1;
267 self.inline_docs += doc_count as u64;
268 }
269
270 pub fn total_terms(&self) -> u32 {
271 self.bitpacked_count + self.elias_fano_count + self.roaring_count + self.inline_count
272 }
273
274 pub fn total_postings(&self) -> u64 {
275 self.bitpacked_docs + self.elias_fano_docs + self.roaring_docs + self.inline_docs
276 }
277}
278
279#[cfg(test)]
280mod tests {
281 use super::*;
282
283 #[test]
284 fn test_format_selection() {
285 assert_eq!(
287 PostingFormat::select(100, 1_000_000),
288 PostingFormat::Bitpacked
289 );
290
291 assert_eq!(
293 PostingFormat::select(5_000, 1_000_000),
294 PostingFormat::Bitpacked
295 );
296
297 assert_eq!(
301 PostingFormat::select(15_000, 10_000_000),
302 PostingFormat::EliasFano
303 );
304
305 assert_eq!(
307 PostingFormat::select(50_000, 1_000_000),
308 PostingFormat::Roaring
309 );
310 }
311
312 #[test]
313 fn test_compressed_posting_list_bitpacked() {
314 let doc_ids: Vec<u32> = (0..100).map(|i| i * 2).collect();
315 let term_freqs: Vec<u32> = vec![1; 100];
316
317 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
318
319 assert_eq!(list.format(), PostingFormat::Bitpacked);
320 assert_eq!(list.doc_count(), 100);
321
322 let mut iter = list.iterator();
323 for (i, &expected) in doc_ids.iter().enumerate() {
324 assert_eq!(iter.doc(), expected, "Mismatch at {}", i);
325 iter.advance();
326 }
327 }
328
329 #[test]
330 fn test_compressed_posting_list_elias_fano() {
331 let doc_ids: Vec<u32> = (0..15_000).map(|i| i * 2).collect();
332 let term_freqs: Vec<u32> = vec![1; 15_000];
333
334 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 10_000_000, 1.0);
336
337 assert_eq!(list.format(), PostingFormat::EliasFano);
338 assert_eq!(list.doc_count(), 15_000);
339 }
340
341 #[test]
342 fn test_compressed_posting_list_serialization() {
343 let doc_ids: Vec<u32> = (0..500).map(|i| i * 3).collect();
344 let term_freqs: Vec<u32> = (0..500).map(|i| (i % 5) + 1).collect();
345
346 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
347
348 let mut buffer = Vec::new();
349 list.serialize(&mut buffer).unwrap();
350
351 let restored = CompressedPostingList::deserialize(&mut &buffer[..]).unwrap();
352
353 assert_eq!(restored.format(), list.format());
354 assert_eq!(restored.doc_count(), list.doc_count());
355
356 let mut iter1 = list.iterator();
358 let mut iter2 = restored.iterator();
359
360 while iter1.doc() != u32::MAX {
361 assert_eq!(iter1.doc(), iter2.doc());
362 assert_eq!(iter1.term_freq(), iter2.term_freq());
363 iter1.advance();
364 iter2.advance();
365 }
366 }
367
368 #[test]
369 fn test_iterator_seek() {
370 let doc_ids: Vec<u32> = vec![10, 20, 30, 100, 200, 300, 1000, 2000];
371 let term_freqs: Vec<u32> = vec![1; 8];
372
373 let list = CompressedPostingList::from_postings(&doc_ids, &term_freqs, 1_000_000, 1.0);
374 let mut iter = list.iterator();
375
376 assert_eq!(iter.seek(25), 30);
377 assert_eq!(iter.seek(100), 100);
378 assert_eq!(iter.seek(500), 1000);
379 assert_eq!(iter.seek(3000), u32::MAX);
380 }
381}