1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18#[derive(Debug, Clone)]
20enum Container {
21 Array(Vec<u16>),
23 Bitmap(Box<[u64; 1024]>),
25 Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30 fn new_array() -> Self {
31 Container::Array(Vec::new())
32 }
33
34 #[allow(dead_code)]
35 fn new_bitmap() -> Self {
36 Container::Bitmap(Box::new([0u64; 1024]))
37 }
38
39 fn cardinality(&self) -> u32 {
40 match self {
41 Container::Array(arr) => arr.len() as u32,
42 Container::Bitmap(bm) => bm.iter().map(|w| w.count_ones()).sum(),
43 Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44 }
45 }
46
47 fn contains(&self, val: u16) -> bool {
48 match self {
49 Container::Array(arr) => arr.binary_search(&val).is_ok(),
50 Container::Bitmap(bm) => {
51 let word_idx = (val / 64) as usize;
52 let bit_idx = val % 64;
53 (bm[word_idx] >> bit_idx) & 1 == 1
54 }
55 Container::Runs(runs) => {
56 for &(start, len) in runs {
57 if val >= start && val <= start + len {
58 return true;
59 }
60 if val < start {
61 return false;
62 }
63 }
64 false
65 }
66 }
67 }
68
69 fn insert(&mut self, val: u16) -> bool {
70 match self {
71 Container::Array(arr) => {
72 match arr.binary_search(&val) {
73 Ok(_) => false, Err(pos) => {
75 arr.insert(pos, val);
76 true
77 }
78 }
79 }
80 Container::Bitmap(bm) => {
81 let word_idx = (val / 64) as usize;
82 let bit_idx = val % 64;
83 let old = bm[word_idx];
84 bm[word_idx] |= 1u64 << bit_idx;
85 old != bm[word_idx]
86 }
87 Container::Runs(runs) => {
88 let mut arr: Vec<u16> = Vec::new();
90 for &(start, len) in runs.iter() {
91 for i in 0..=len {
92 arr.push(start + i);
93 }
94 }
95 let inserted = match arr.binary_search(&val) {
96 Ok(_) => false,
97 Err(pos) => {
98 arr.insert(pos, val);
99 true
100 }
101 };
102 *self = Container::Array(arr);
103 inserted
104 }
105 }
106 }
107
108 fn optimize(&mut self) {
109 let card = self.cardinality() as usize;
110
111 match self {
112 Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
113 let mut bm = Box::new([0u64; 1024]);
115 for &val in arr.iter() {
116 let word_idx = (val / 64) as usize;
117 let bit_idx = val % 64;
118 bm[word_idx] |= 1u64 << bit_idx;
119 }
120 *self = Container::Bitmap(bm);
121 }
122 Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
123 let mut arr = Vec::with_capacity(card);
125 for (word_idx, &word) in bm.iter().enumerate() {
126 let mut w = word;
127 while w != 0 {
128 let bit_idx = w.trailing_zeros();
129 arr.push((word_idx * 64 + bit_idx as usize) as u16);
130 w &= w - 1;
131 }
132 }
133 *self = Container::Array(arr);
134 }
135 _ => {}
136 }
137
138 self.try_run_encode();
140 }
141
142 fn try_run_encode(&mut self) {
143 let arr = match self {
144 Container::Array(arr) if arr.len() >= 4 => arr,
145 _ => return,
146 };
147
148 let mut runs = Vec::new();
149 let mut i = 0;
150
151 while i < arr.len() {
152 let start = arr[i];
153 let mut end = start;
154
155 while i + 1 < arr.len() && arr[i + 1] == end + 1 {
156 end = arr[i + 1];
157 i += 1;
158 }
159
160 runs.push((start, end - start));
161 i += 1;
162 }
163
164 if runs.len() * 4 < arr.len() * 2 {
168 *self = Container::Runs(runs);
169 }
170 }
171
172 fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
173 match self {
174 Container::Array(arr) => {
175 writer.write_u8(0)?; writer.write_u16::<LittleEndian>(arr.len() as u16)?;
177 for &val in arr {
178 writer.write_u16::<LittleEndian>(val)?;
179 }
180 }
181 Container::Bitmap(bm) => {
182 writer.write_u8(1)?; for &word in bm.iter() {
184 writer.write_u64::<LittleEndian>(word)?;
185 }
186 }
187 Container::Runs(runs) => {
188 writer.write_u8(2)?; writer.write_u16::<LittleEndian>(runs.len() as u16)?;
190 for &(start, len) in runs {
191 writer.write_u16::<LittleEndian>(start)?;
192 writer.write_u16::<LittleEndian>(len)?;
193 }
194 }
195 }
196 Ok(())
197 }
198
199 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
200 let tag = reader.read_u8()?;
201 match tag {
202 0 => {
203 let len = reader.read_u16::<LittleEndian>()? as usize;
204 let mut arr = Vec::with_capacity(len);
205 for _ in 0..len {
206 arr.push(reader.read_u16::<LittleEndian>()?);
207 }
208 Ok(Container::Array(arr))
209 }
210 1 => {
211 let mut bm = Box::new([0u64; 1024]);
212 for word in bm.iter_mut() {
213 *word = reader.read_u64::<LittleEndian>()?;
214 }
215 Ok(Container::Bitmap(bm))
216 }
217 2 => {
218 let len = reader.read_u16::<LittleEndian>()? as usize;
219 let mut runs = Vec::with_capacity(len);
220 for _ in 0..len {
221 let start = reader.read_u16::<LittleEndian>()?;
222 let run_len = reader.read_u16::<LittleEndian>()?;
223 runs.push((start, run_len));
224 }
225 Ok(Container::Runs(runs))
226 }
227 _ => Err(io::Error::new(
228 io::ErrorKind::InvalidData,
229 "Invalid container type",
230 )),
231 }
232 }
233
234 fn size_bytes(&self) -> usize {
235 match self {
236 Container::Array(arr) => arr.len() * 2 + 4,
237 Container::Bitmap(_) => 8 * 1024 + 1,
238 Container::Runs(runs) => runs.len() * 4 + 4,
239 }
240 }
241}
242
243#[derive(Debug, Clone)]
245pub struct RoaringBitmap {
246 containers: Vec<(u16, Container)>,
248}
249
250impl RoaringBitmap {
251 pub fn new() -> Self {
253 Self {
254 containers: Vec::new(),
255 }
256 }
257
258 pub fn from_sorted_slice(values: &[u32]) -> Self {
260 let mut bitmap = Self::new();
261
262 if values.is_empty() {
263 return bitmap;
264 }
265
266 let mut current_high = (values[0] >> 16) as u16;
267 let mut current_container = Container::new_array();
268
269 for &val in values {
270 let high = (val >> 16) as u16;
271 let low = val as u16;
272
273 if high != current_high {
274 current_container.optimize();
275 bitmap.containers.push((current_high, current_container));
276 current_high = high;
277 current_container = Container::new_array();
278 }
279
280 current_container.insert(low);
281 }
282
283 current_container.optimize();
284 bitmap.containers.push((current_high, current_container));
285
286 bitmap
287 }
288
289 pub fn insert(&mut self, val: u32) -> bool {
291 let high = (val >> 16) as u16;
292 let low = val as u16;
293
294 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
296 Ok(idx) => self.containers[idx].1.insert(low),
297 Err(idx) => {
298 let mut container = Container::new_array();
299 container.insert(low);
300 self.containers.insert(idx, (high, container));
301 true
302 }
303 }
304 }
305
306 pub fn contains(&self, val: u32) -> bool {
308 let high = (val >> 16) as u16;
309 let low = val as u16;
310
311 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
312 Ok(idx) => self.containers[idx].1.contains(low),
313 Err(_) => false,
314 }
315 }
316
317 pub fn cardinality(&self) -> u32 {
319 self.containers.iter().map(|(_, c)| c.cardinality()).sum()
320 }
321
322 pub fn is_empty(&self) -> bool {
324 self.containers.is_empty()
325 }
326
327 pub fn optimize(&mut self) {
329 for (_, container) in &mut self.containers {
330 container.optimize();
331 }
332 }
333
334 pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
336 let mut result = RoaringBitmap::new();
337
338 let mut i = 0;
339 let mut j = 0;
340
341 while i < self.containers.len() && j < other.containers.len() {
342 let (high1, c1) = &self.containers[i];
343 let (high2, c2) = &other.containers[j];
344
345 match high1.cmp(high2) {
346 std::cmp::Ordering::Less => i += 1,
347 std::cmp::Ordering::Greater => j += 1,
348 std::cmp::Ordering::Equal => {
349 let intersected = Self::intersect_containers(c1, c2);
350 if intersected.cardinality() > 0 {
351 result.containers.push((*high1, intersected));
352 }
353 i += 1;
354 j += 1;
355 }
356 }
357 }
358
359 result
360 }
361
362 fn intersect_containers(c1: &Container, c2: &Container) -> Container {
363 match (c1, c2) {
364 (Container::Array(a1), Container::Array(a2)) => {
365 let mut result = Vec::new();
366 let mut i = 0;
367 let mut j = 0;
368 while i < a1.len() && j < a2.len() {
369 match a1[i].cmp(&a2[j]) {
370 std::cmp::Ordering::Less => i += 1,
371 std::cmp::Ordering::Greater => j += 1,
372 std::cmp::Ordering::Equal => {
373 result.push(a1[i]);
374 i += 1;
375 j += 1;
376 }
377 }
378 }
379 Container::Array(result)
380 }
381 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
382 let mut result = Box::new([0u64; 1024]);
383 for i in 0..1024 {
384 result[i] = b1[i] & b2[i];
385 }
386 let mut c = Container::Bitmap(result);
387 c.optimize();
388 c
389 }
390 (Container::Array(arr), Container::Bitmap(bm))
391 | (Container::Bitmap(bm), Container::Array(arr)) => {
392 let mut result = Vec::new();
393 for &val in arr {
394 let word_idx = (val / 64) as usize;
395 let bit_idx = val % 64;
396 if (bm[word_idx] >> bit_idx) & 1 == 1 {
397 result.push(val);
398 }
399 }
400 Container::Array(result)
401 }
402 _ => {
403 let arr1 = Self::container_to_array(c1);
405 let arr2 = Self::container_to_array(c2);
406 Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
407 }
408 }
409 }
410
411 fn container_to_array(c: &Container) -> Vec<u16> {
412 match c {
413 Container::Array(arr) => arr.clone(),
414 Container::Bitmap(bm) => {
415 let mut arr = Vec::new();
416 for (word_idx, &word) in bm.iter().enumerate() {
417 let mut w = word;
418 while w != 0 {
419 let bit_idx = w.trailing_zeros();
420 arr.push((word_idx * 64 + bit_idx as usize) as u16);
421 w &= w - 1;
422 }
423 }
424 arr
425 }
426 Container::Runs(runs) => {
427 let mut arr = Vec::new();
428 for &(start, len) in runs {
429 for i in 0..=len {
430 arr.push(start + i);
431 }
432 }
433 arr
434 }
435 }
436 }
437
438 pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
440 let mut result = RoaringBitmap::new();
441
442 let mut i = 0;
443 let mut j = 0;
444
445 while i < self.containers.len() || j < other.containers.len() {
446 if i >= self.containers.len() {
447 result.containers.push(other.containers[j].clone());
448 j += 1;
449 } else if j >= other.containers.len() {
450 result.containers.push(self.containers[i].clone());
451 i += 1;
452 } else {
453 let (high1, c1) = &self.containers[i];
454 let (high2, c2) = &other.containers[j];
455
456 match high1.cmp(high2) {
457 std::cmp::Ordering::Less => {
458 result.containers.push(self.containers[i].clone());
459 i += 1;
460 }
461 std::cmp::Ordering::Greater => {
462 result.containers.push(other.containers[j].clone());
463 j += 1;
464 }
465 std::cmp::Ordering::Equal => {
466 let united = Self::union_containers(c1, c2);
467 result.containers.push((*high1, united));
468 i += 1;
469 j += 1;
470 }
471 }
472 }
473 }
474
475 result
476 }
477
478 fn union_containers(c1: &Container, c2: &Container) -> Container {
479 match (c1, c2) {
480 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
481 let mut result = Box::new([0u64; 1024]);
482 for i in 0..1024 {
483 result[i] = b1[i] | b2[i];
484 }
485 Container::Bitmap(result)
486 }
487 _ => {
488 let arr1 = Self::container_to_array(c1);
490 let arr2 = Self::container_to_array(c2);
491
492 let mut result = Vec::with_capacity(arr1.len() + arr2.len());
493 let mut i = 0;
494 let mut j = 0;
495
496 while i < arr1.len() && j < arr2.len() {
497 match arr1[i].cmp(&arr2[j]) {
498 std::cmp::Ordering::Less => {
499 result.push(arr1[i]);
500 i += 1;
501 }
502 std::cmp::Ordering::Greater => {
503 result.push(arr2[j]);
504 j += 1;
505 }
506 std::cmp::Ordering::Equal => {
507 result.push(arr1[i]);
508 i += 1;
509 j += 1;
510 }
511 }
512 }
513
514 result.extend_from_slice(&arr1[i..]);
515 result.extend_from_slice(&arr2[j..]);
516
517 let mut c = Container::Array(result);
518 c.optimize();
519 c
520 }
521 }
522 }
523
524 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
526 writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
527 for (high, container) in &self.containers {
528 writer.write_u16::<LittleEndian>(*high)?;
529 container.serialize(writer)?;
530 }
531 Ok(())
532 }
533
534 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
536 let num_containers = reader.read_u32::<LittleEndian>()? as usize;
537 let mut containers = Vec::with_capacity(num_containers);
538
539 for _ in 0..num_containers {
540 let high = reader.read_u16::<LittleEndian>()?;
541 let container = Container::deserialize(reader)?;
542 containers.push((high, container));
543 }
544
545 Ok(Self { containers })
546 }
547
548 pub fn size_bytes(&self) -> usize {
550 4 + self
551 .containers
552 .iter()
553 .map(|(_, c)| 2 + c.size_bytes())
554 .sum::<usize>()
555 }
556
557 pub fn iter(&self) -> RoaringIterator<'_> {
559 RoaringIterator {
560 bitmap: self,
561 container_idx: 0,
562 value_idx: 0,
563 current_values: Vec::new(),
564 }
565 }
566}
567
568impl Default for RoaringBitmap {
569 fn default() -> Self {
570 Self::new()
571 }
572}
573
574pub struct RoaringIterator<'a> {
576 bitmap: &'a RoaringBitmap,
577 container_idx: usize,
578 value_idx: usize,
579 current_values: Vec<u16>,
580}
581
582impl<'a> Iterator for RoaringIterator<'a> {
583 type Item = u32;
584
585 fn next(&mut self) -> Option<Self::Item> {
586 loop {
587 if self.value_idx < self.current_values.len() {
588 let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
589 let low = self.current_values[self.value_idx] as u32;
590 self.value_idx += 1;
591 return Some((high << 16) | low);
592 }
593
594 if self.container_idx >= self.bitmap.containers.len() {
595 return None;
596 }
597
598 let (_, container) = &self.bitmap.containers[self.container_idx];
599 self.current_values = RoaringBitmap::container_to_array(container);
600 self.value_idx = 0;
601 self.container_idx += 1;
602 }
603 }
604}
605
606pub const ROARING_BLOCK_SIZE: usize = 65536;
608
609#[derive(Debug, Clone)]
611pub struct RoaringBlockInfo {
612 pub container_key: u16,
614 pub first_doc_id: u32,
616 pub last_doc_id: u32,
618 pub max_tf: u32,
620 pub max_block_score: f32,
622 pub num_docs: u32,
624}
625
626impl RoaringBlockInfo {
627 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
629 writer.write_u16::<LittleEndian>(self.container_key)?;
630 writer.write_u32::<LittleEndian>(self.first_doc_id)?;
631 writer.write_u32::<LittleEndian>(self.last_doc_id)?;
632 writer.write_u32::<LittleEndian>(self.max_tf)?;
633 writer.write_f32::<LittleEndian>(self.max_block_score)?;
634 writer.write_u32::<LittleEndian>(self.num_docs)?;
635 Ok(())
636 }
637
638 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
640 Ok(Self {
641 container_key: reader.read_u16::<LittleEndian>()?,
642 first_doc_id: reader.read_u32::<LittleEndian>()?,
643 last_doc_id: reader.read_u32::<LittleEndian>()?,
644 max_tf: reader.read_u32::<LittleEndian>()?,
645 max_block_score: reader.read_f32::<LittleEndian>()?,
646 num_docs: reader.read_u32::<LittleEndian>()?,
647 })
648 }
649}
650
651#[derive(Debug, Clone)]
653pub struct RoaringPostingList {
654 pub doc_ids: RoaringBitmap,
656 pub term_freqs: Vec<(u32, u32)>,
659 pub default_tf: u32,
661 pub max_tf: u32,
663 pub blocks: Vec<RoaringBlockInfo>,
665 pub max_score: f32,
667}
668
669impl RoaringPostingList {
670 const K1: f32 = 1.2;
672 const B: f32 = 0.75;
673
674 #[inline]
676 pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
677 let tf = max_tf as f32;
678 let min_length_norm = 1.0 - Self::B;
680 let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
681 idf * tf_norm
682 }
683
684 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
686 Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
687 }
688
689 pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
691 assert_eq!(doc_ids.len(), term_freqs.len());
692
693 if doc_ids.is_empty() {
694 return Self {
695 doc_ids: RoaringBitmap::new(),
696 term_freqs: Vec::new(),
697 default_tf: 1,
698 max_tf: 0,
699 blocks: Vec::new(),
700 max_score: 0.0,
701 };
702 }
703
704 let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
705
706 let mut tf_counts = std::collections::HashMap::new();
708 for &tf in term_freqs {
709 *tf_counts.entry(tf).or_insert(0u32) += 1;
710 }
711 let default_tf = tf_counts
712 .iter()
713 .max_by_key(|&(_, count)| count)
714 .map(|(&tf, _)| tf)
715 .unwrap_or(1);
716
717 let exceptions: Vec<(u32, u32)> = doc_ids
719 .iter()
720 .zip(term_freqs.iter())
721 .filter(|&(_, &tf)| tf != default_tf)
722 .map(|(&doc, &tf)| (doc, tf))
723 .collect();
724
725 let max_tf = *term_freqs.iter().max().unwrap_or(&1);
726
727 let mut blocks = Vec::new();
730 let mut max_score = 0.0f32;
731 let mut i = 0;
732
733 while i < doc_ids.len() {
734 let container_key = (doc_ids[i] >> 16) as u16;
735 let block_start = i;
736
737 while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
739 i += 1;
740 }
741
742 let block_doc_ids = &doc_ids[block_start..i];
743 let block_tfs = &term_freqs[block_start..i];
744 let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
745 let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
746 max_score = max_score.max(block_score);
747
748 blocks.push(RoaringBlockInfo {
749 container_key,
750 first_doc_id: block_doc_ids[0],
751 last_doc_id: *block_doc_ids.last().unwrap(),
752 max_tf: block_max_tf,
753 max_block_score: block_score,
754 num_docs: block_doc_ids.len() as u32,
755 });
756 }
757
758 Self {
759 doc_ids: bitmap,
760 term_freqs: exceptions,
761 default_tf,
762 max_tf,
763 blocks,
764 max_score,
765 }
766 }
767
768 pub fn get_tf(&self, doc_id: u32) -> u32 {
770 match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
772 Ok(idx) => self.term_freqs[idx].1,
773 Err(_) => self.default_tf,
774 }
775 }
776
777 pub fn len(&self) -> u32 {
779 self.doc_ids.cardinality()
780 }
781
782 pub fn is_empty(&self) -> bool {
784 self.doc_ids.is_empty()
785 }
786
787 pub fn num_blocks(&self) -> usize {
789 self.blocks.len()
790 }
791
792 pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
794 let container_key = (doc_id >> 16) as u16;
795 self.blocks
796 .binary_search_by_key(&container_key, |b| b.container_key)
797 .ok()
798 }
799
800 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
802 self.doc_ids.serialize(writer)?;
803 writer.write_u32::<LittleEndian>(self.default_tf)?;
804 writer.write_u32::<LittleEndian>(self.max_tf)?;
805 writer.write_f32::<LittleEndian>(self.max_score)?;
806 writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
807 for &(doc, tf) in &self.term_freqs {
808 writer.write_u32::<LittleEndian>(doc)?;
809 writer.write_u32::<LittleEndian>(tf)?;
810 }
811
812 writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
814 for block in &self.blocks {
815 block.serialize(writer)?;
816 }
817
818 Ok(())
819 }
820
821 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
823 let doc_ids = RoaringBitmap::deserialize(reader)?;
824 let default_tf = reader.read_u32::<LittleEndian>()?;
825 let max_tf = reader.read_u32::<LittleEndian>()?;
826 let max_score = reader.read_f32::<LittleEndian>()?;
827 let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
828 let mut term_freqs = Vec::with_capacity(num_exceptions);
829 for _ in 0..num_exceptions {
830 let doc = reader.read_u32::<LittleEndian>()?;
831 let tf = reader.read_u32::<LittleEndian>()?;
832 term_freqs.push((doc, tf));
833 }
834
835 let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
837 let mut blocks = Vec::with_capacity(num_blocks);
838 for _ in 0..num_blocks {
839 blocks.push(RoaringBlockInfo::deserialize(reader)?);
840 }
841
842 Ok(Self {
843 doc_ids,
844 term_freqs,
845 default_tf,
846 max_tf,
847 blocks,
848 max_score,
849 })
850 }
851
852 pub fn iterator(&self) -> RoaringPostingIterator<'_> {
854 RoaringPostingIterator {
855 list: self,
856 doc_iter: self.doc_ids.iter(),
857 current_doc: None,
858 current_block: 0,
859 }
860 }
861}
862
863pub struct RoaringPostingIterator<'a> {
865 list: &'a RoaringPostingList,
866 doc_iter: RoaringIterator<'a>,
867 current_doc: Option<u32>,
868 current_block: usize,
869}
870
871impl<'a> RoaringPostingIterator<'a> {
872 pub fn doc(&self) -> u32 {
874 self.current_doc.unwrap_or(u32::MAX)
875 }
876
877 pub fn term_freq(&self) -> u32 {
879 match self.current_doc {
880 Some(doc) => self.list.get_tf(doc),
881 None => 0,
882 }
883 }
884
885 pub fn advance(&mut self) -> u32 {
887 self.current_doc = self.doc_iter.next();
888 if let Some(doc) = self.current_doc
890 && !self.list.blocks.is_empty()
891 {
892 let container_key = (doc >> 16) as u16;
893 while self.current_block < self.list.blocks.len()
895 && self.list.blocks[self.current_block].container_key < container_key
896 {
897 self.current_block += 1;
898 }
899 }
900 self.doc()
901 }
902
903 pub fn init(&mut self) {
905 self.current_doc = self.doc_iter.next();
906 self.current_block = 0;
907 }
908
909 pub fn seek(&mut self, target: u32) -> u32 {
911 if !self.list.blocks.is_empty() {
913 let target_container = (target >> 16) as u16;
914
915 while self.current_block < self.list.blocks.len()
917 && self.list.blocks[self.current_block].last_doc_id < target
918 {
919 self.current_block += 1;
920 }
921
922 if self.current_block >= self.list.blocks.len() {
924 self.current_doc = None;
925 return u32::MAX;
926 }
927
928 let block = &self.list.blocks[self.current_block];
930 if block.container_key > target_container
931 || (block.container_key == target_container && block.first_doc_id > self.doc())
932 {
933 while let Some(doc) = self.current_doc {
935 if doc >= block.first_doc_id {
936 break;
937 }
938 self.current_doc = self.doc_iter.next();
939 }
940 }
941 }
942
943 while let Some(doc) = self.current_doc {
945 if doc >= target {
946 return doc;
947 }
948 self.current_doc = self.doc_iter.next();
949 }
950 u32::MAX
951 }
952
953 pub fn is_exhausted(&self) -> bool {
955 self.current_doc.is_none()
956 }
957
958 pub fn current_block_max_score(&self) -> f32 {
960 if self.current_doc.is_none() || self.list.blocks.is_empty() {
961 return 0.0;
962 }
963 if self.current_block < self.list.blocks.len() {
964 self.list.blocks[self.current_block].max_block_score
965 } else {
966 0.0
967 }
968 }
969
970 pub fn current_block_max_tf(&self) -> u32 {
972 if self.current_doc.is_none() || self.list.blocks.is_empty() {
973 return 0;
974 }
975 if self.current_block < self.list.blocks.len() {
976 self.list.blocks[self.current_block].max_tf
977 } else {
978 0
979 }
980 }
981
982 pub fn max_remaining_score(&self) -> f32 {
984 if self.current_doc.is_none() || self.list.blocks.is_empty() {
985 return 0.0;
986 }
987 self.list.blocks[self.current_block..]
988 .iter()
989 .map(|b| b.max_block_score)
990 .fold(0.0f32, |a, b| a.max(b))
991 }
992
993 pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
996 if self.list.blocks.is_empty() {
997 return None;
998 }
999
1000 while self.current_block < self.list.blocks.len() {
1001 let block = &self.list.blocks[self.current_block];
1002 if block.last_doc_id >= target {
1003 while let Some(doc) = self.current_doc {
1005 if doc >= block.first_doc_id {
1006 break;
1007 }
1008 self.current_doc = self.doc_iter.next();
1009 }
1010 return Some((block.first_doc_id, block.max_block_score));
1011 }
1012 self.current_block += 1;
1013 }
1014
1015 self.current_doc = None;
1016 None
1017 }
1018}
1019
1020#[cfg(test)]
1021mod tests {
1022 use super::*;
1023
1024 #[test]
1025 fn test_roaring_basic() {
1026 let mut bitmap = RoaringBitmap::new();
1027 bitmap.insert(1);
1028 bitmap.insert(100);
1029 bitmap.insert(1000);
1030 bitmap.insert(100000);
1031
1032 assert!(bitmap.contains(1));
1033 assert!(bitmap.contains(100));
1034 assert!(bitmap.contains(1000));
1035 assert!(bitmap.contains(100000));
1036 assert!(!bitmap.contains(2));
1037 assert!(!bitmap.contains(50000));
1038
1039 assert_eq!(bitmap.cardinality(), 4);
1040 }
1041
1042 #[test]
1043 fn test_roaring_from_sorted() {
1044 let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1045 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1046
1047 assert_eq!(bitmap.cardinality(), 10000);
1048
1049 for &val in &values {
1050 assert!(bitmap.contains(val), "Missing value {}", val);
1051 }
1052 }
1053
1054 #[test]
1055 fn test_roaring_intersection() {
1056 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1057 let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1058
1059 let result = a.and(&b);
1060
1061 assert_eq!(result.cardinality(), 4);
1062 assert!(result.contains(2));
1063 assert!(result.contains(3));
1064 assert!(result.contains(200));
1065 assert!(result.contains(300));
1066 }
1067
1068 #[test]
1069 fn test_roaring_union() {
1070 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1071 let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1072
1073 let result = a.or(&b);
1074
1075 assert_eq!(result.cardinality(), 5);
1076 for i in 1..=5 {
1077 assert!(result.contains(i));
1078 }
1079 }
1080
1081 #[test]
1082 fn test_roaring_serialization() {
1083 let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1084 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1085
1086 let mut buffer = Vec::new();
1087 bitmap.serialize(&mut buffer).unwrap();
1088
1089 let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1090
1091 assert_eq!(restored.cardinality(), bitmap.cardinality());
1092 for &val in &values {
1093 assert!(restored.contains(val));
1094 }
1095 }
1096
1097 #[test]
1098 fn test_roaring_posting_list() {
1099 let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1100 let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1101
1102 let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1103
1104 assert_eq!(list.len(), 7);
1105 assert_eq!(list.default_tf, 1);
1106 assert_eq!(list.max_tf, 3);
1107
1108 assert_eq!(list.get_tf(1), 1);
1110 assert_eq!(list.get_tf(10), 2);
1111 assert_eq!(list.get_tf(100), 3);
1112 }
1113
1114 #[test]
1115 fn test_roaring_iterator() {
1116 let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1117 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1118
1119 let collected: Vec<u32> = bitmap.iter().collect();
1120 assert_eq!(collected, values);
1121 }
1122
1123 #[test]
1124 fn test_roaring_block_max() {
1125 let mut doc_ids = Vec::new();
1130 let mut term_freqs = Vec::new();
1131
1132 for i in 0..100 {
1134 doc_ids.push(i * 100);
1135 term_freqs.push(if i == 50 { 2 } else { 1 });
1136 }
1137
1138 for i in 0..100 {
1140 doc_ids.push(65536 + i * 100);
1141 term_freqs.push(if i == 25 { 5 } else { 1 });
1142 }
1143
1144 for i in 0..100 {
1146 doc_ids.push(131072 + i * 100);
1147 term_freqs.push(if i == 75 { 3 } else { 1 });
1148 }
1149
1150 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1151
1152 assert_eq!(list.num_blocks(), 3);
1154 assert_eq!(list.blocks[0].container_key, 0);
1155 assert_eq!(list.blocks[1].container_key, 1);
1156 assert_eq!(list.blocks[2].container_key, 2);
1157
1158 assert_eq!(list.blocks[0].max_tf, 2);
1159 assert_eq!(list.blocks[1].max_tf, 5);
1160 assert_eq!(list.blocks[2].max_tf, 3);
1161
1162 assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1164 assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1165
1166 assert_eq!(list.max_score, list.blocks[1].max_block_score);
1168
1169 let mut iter = list.iterator();
1171 iter.init();
1172 assert_eq!(iter.current_block_max_tf(), 2); iter.seek(65536);
1176 assert_eq!(iter.current_block_max_tf(), 5);
1177
1178 iter.seek(131072);
1180 assert_eq!(iter.current_block_max_tf(), 3);
1181 }
1182
1183 #[test]
1184 fn test_roaring_block_max_serialization() {
1185 let mut doc_ids = Vec::new();
1186 let mut term_freqs = Vec::new();
1187
1188 for i in 0..50 {
1190 doc_ids.push(i * 10);
1191 term_freqs.push((i % 5) + 1);
1192 }
1193 for i in 0..50 {
1194 doc_ids.push(65536 + i * 10);
1195 term_freqs.push((i % 3) + 1);
1196 }
1197
1198 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1199
1200 let mut buffer = Vec::new();
1201 list.serialize(&mut buffer).unwrap();
1202
1203 let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1204
1205 assert_eq!(restored.len(), list.len());
1206 assert_eq!(restored.max_tf, list.max_tf);
1207 assert_eq!(restored.max_score, list.max_score);
1208 assert_eq!(restored.num_blocks(), list.num_blocks());
1209
1210 for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1212 assert_eq!(orig.container_key, rest.container_key);
1213 assert_eq!(orig.first_doc_id, rest.first_doc_id);
1214 assert_eq!(orig.last_doc_id, rest.last_doc_id);
1215 assert_eq!(orig.max_tf, rest.max_tf);
1216 assert_eq!(orig.max_block_score, rest.max_block_score);
1217 }
1218
1219 let mut iter1 = list.iterator();
1221 let mut iter2 = restored.iterator();
1222 iter1.init();
1223 iter2.init();
1224
1225 while !iter1.is_exhausted() {
1226 assert_eq!(iter1.doc(), iter2.doc());
1227 assert_eq!(iter1.term_freq(), iter2.term_freq());
1228 iter1.advance();
1229 iter2.advance();
1230 }
1231 assert!(iter2.is_exhausted());
1232 }
1233}