1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18#[derive(Debug, Clone)]
20enum Container {
21 Array(Vec<u16>),
23 Bitmap(Box<[u64; 1024]>),
25 Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30 fn new_array() -> Self {
31 Container::Array(Vec::new())
32 }
33
34 #[allow(dead_code)]
35 fn new_bitmap() -> Self {
36 Container::Bitmap(Box::new([0u64; 1024]))
37 }
38
39 fn cardinality(&self) -> u32 {
40 match self {
41 Container::Array(arr) => arr.len() as u32,
42 Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
43 Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44 }
45 }
46
47 #[inline]
49 fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
50 #[cfg(target_arch = "aarch64")]
51 {
52 use std::arch::aarch64::*;
53
54 let mut total = 0u32;
55 let bytes = bm.as_ptr() as *const u8;
56
57 unsafe {
59 for i in (0..8192).step_by(16) {
60 let v = vld1q_u8(bytes.add(i));
61 let cnt = vcntq_u8(v);
62 total += vaddlvq_u8(cnt) as u32;
63 }
64 }
65
66 total
67 }
68
69 #[cfg(target_arch = "x86_64")]
70 {
71 let mut total = 0u32;
73 for &word in bm.iter() {
74 total += word.count_ones();
75 }
76 total
77 }
78
79 #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
80 {
81 bm.iter().map(|w| w.count_ones()).sum()
82 }
83 }
84
85 fn contains(&self, val: u16) -> bool {
86 match self {
87 Container::Array(arr) => arr.binary_search(&val).is_ok(),
88 Container::Bitmap(bm) => {
89 let word_idx = (val / 64) as usize;
90 let bit_idx = val % 64;
91 (bm[word_idx] >> bit_idx) & 1 == 1
92 }
93 Container::Runs(runs) => {
94 for &(start, len) in runs {
95 if val >= start && val <= start + len {
96 return true;
97 }
98 if val < start {
99 return false;
100 }
101 }
102 false
103 }
104 }
105 }
106
107 fn insert(&mut self, val: u16) -> bool {
108 match self {
109 Container::Array(arr) => {
110 match arr.binary_search(&val) {
111 Ok(_) => false, Err(pos) => {
113 arr.insert(pos, val);
114 true
115 }
116 }
117 }
118 Container::Bitmap(bm) => {
119 let word_idx = (val / 64) as usize;
120 let bit_idx = val % 64;
121 let old = bm[word_idx];
122 bm[word_idx] |= 1u64 << bit_idx;
123 old != bm[word_idx]
124 }
125 Container::Runs(runs) => {
126 let mut arr: Vec<u16> = Vec::new();
128 for &(start, len) in runs.iter() {
129 for i in 0..=len {
130 arr.push(start + i);
131 }
132 }
133 let inserted = match arr.binary_search(&val) {
134 Ok(_) => false,
135 Err(pos) => {
136 arr.insert(pos, val);
137 true
138 }
139 };
140 *self = Container::Array(arr);
141 inserted
142 }
143 }
144 }
145
146 fn optimize(&mut self) {
147 let card = self.cardinality() as usize;
148
149 match self {
150 Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
151 let mut bm = Box::new([0u64; 1024]);
153 for &val in arr.iter() {
154 let word_idx = (val / 64) as usize;
155 let bit_idx = val % 64;
156 bm[word_idx] |= 1u64 << bit_idx;
157 }
158 *self = Container::Bitmap(bm);
159 }
160 Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
161 let mut arr = Vec::with_capacity(card);
163 for (word_idx, &word) in bm.iter().enumerate() {
164 let mut w = word;
165 while w != 0 {
166 let bit_idx = w.trailing_zeros();
167 arr.push((word_idx * 64 + bit_idx as usize) as u16);
168 w &= w - 1;
169 }
170 }
171 *self = Container::Array(arr);
172 }
173 _ => {}
174 }
175
176 self.try_run_encode();
178 }
179
180 fn try_run_encode(&mut self) {
181 let arr = match self {
182 Container::Array(arr) if arr.len() >= 4 => arr,
183 _ => return,
184 };
185
186 let mut runs = Vec::new();
187 let mut i = 0;
188
189 while i < arr.len() {
190 let start = arr[i];
191 let mut end = start;
192
193 while i + 1 < arr.len() && arr[i + 1] == end + 1 {
194 end = arr[i + 1];
195 i += 1;
196 }
197
198 runs.push((start, end - start));
199 i += 1;
200 }
201
202 if runs.len() * 4 < arr.len() * 2 {
206 *self = Container::Runs(runs);
207 }
208 }
209
210 fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
211 match self {
212 Container::Array(arr) => {
213 writer.write_u8(0)?; writer.write_u16::<LittleEndian>(arr.len() as u16)?;
215 for &val in arr {
216 writer.write_u16::<LittleEndian>(val)?;
217 }
218 }
219 Container::Bitmap(bm) => {
220 writer.write_u8(1)?; for &word in bm.iter() {
222 writer.write_u64::<LittleEndian>(word)?;
223 }
224 }
225 Container::Runs(runs) => {
226 writer.write_u8(2)?; writer.write_u16::<LittleEndian>(runs.len() as u16)?;
228 for &(start, len) in runs {
229 writer.write_u16::<LittleEndian>(start)?;
230 writer.write_u16::<LittleEndian>(len)?;
231 }
232 }
233 }
234 Ok(())
235 }
236
237 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
238 let tag = reader.read_u8()?;
239 match tag {
240 0 => {
241 let len = reader.read_u16::<LittleEndian>()? as usize;
242 let mut arr = Vec::with_capacity(len);
243 for _ in 0..len {
244 arr.push(reader.read_u16::<LittleEndian>()?);
245 }
246 Ok(Container::Array(arr))
247 }
248 1 => {
249 let mut bm = Box::new([0u64; 1024]);
250 for word in bm.iter_mut() {
251 *word = reader.read_u64::<LittleEndian>()?;
252 }
253 Ok(Container::Bitmap(bm))
254 }
255 2 => {
256 let len = reader.read_u16::<LittleEndian>()? as usize;
257 let mut runs = Vec::with_capacity(len);
258 for _ in 0..len {
259 let start = reader.read_u16::<LittleEndian>()?;
260 let run_len = reader.read_u16::<LittleEndian>()?;
261 runs.push((start, run_len));
262 }
263 Ok(Container::Runs(runs))
264 }
265 _ => Err(io::Error::new(
266 io::ErrorKind::InvalidData,
267 "Invalid container type",
268 )),
269 }
270 }
271
272 fn size_bytes(&self) -> usize {
273 match self {
274 Container::Array(arr) => arr.len() * 2 + 4,
275 Container::Bitmap(_) => 8 * 1024 + 1,
276 Container::Runs(runs) => runs.len() * 4 + 4,
277 }
278 }
279}
280
281#[derive(Debug, Clone)]
283pub struct RoaringBitmap {
284 containers: Vec<(u16, Container)>,
286}
287
288impl RoaringBitmap {
289 pub fn new() -> Self {
291 Self {
292 containers: Vec::new(),
293 }
294 }
295
296 pub fn from_sorted_slice(values: &[u32]) -> Self {
298 let mut bitmap = Self::new();
299
300 if values.is_empty() {
301 return bitmap;
302 }
303
304 let mut current_high = (values[0] >> 16) as u16;
305 let mut current_container = Container::new_array();
306
307 for &val in values {
308 let high = (val >> 16) as u16;
309 let low = val as u16;
310
311 if high != current_high {
312 current_container.optimize();
313 bitmap.containers.push((current_high, current_container));
314 current_high = high;
315 current_container = Container::new_array();
316 }
317
318 current_container.insert(low);
319 }
320
321 current_container.optimize();
322 bitmap.containers.push((current_high, current_container));
323
324 bitmap
325 }
326
327 pub fn insert(&mut self, val: u32) -> bool {
329 let high = (val >> 16) as u16;
330 let low = val as u16;
331
332 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
334 Ok(idx) => self.containers[idx].1.insert(low),
335 Err(idx) => {
336 let mut container = Container::new_array();
337 container.insert(low);
338 self.containers.insert(idx, (high, container));
339 true
340 }
341 }
342 }
343
344 pub fn contains(&self, val: u32) -> bool {
346 let high = (val >> 16) as u16;
347 let low = val as u16;
348
349 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
350 Ok(idx) => self.containers[idx].1.contains(low),
351 Err(_) => false,
352 }
353 }
354
355 pub fn cardinality(&self) -> u32 {
357 self.containers.iter().map(|(_, c)| c.cardinality()).sum()
358 }
359
360 pub fn is_empty(&self) -> bool {
362 self.containers.is_empty()
363 }
364
365 pub fn optimize(&mut self) {
367 for (_, container) in &mut self.containers {
368 container.optimize();
369 }
370 }
371
372 pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
374 let mut result = RoaringBitmap::new();
375
376 let mut i = 0;
377 let mut j = 0;
378
379 while i < self.containers.len() && j < other.containers.len() {
380 let (high1, c1) = &self.containers[i];
381 let (high2, c2) = &other.containers[j];
382
383 match high1.cmp(high2) {
384 std::cmp::Ordering::Less => i += 1,
385 std::cmp::Ordering::Greater => j += 1,
386 std::cmp::Ordering::Equal => {
387 let intersected = Self::intersect_containers(c1, c2);
388 if intersected.cardinality() > 0 {
389 result.containers.push((*high1, intersected));
390 }
391 i += 1;
392 j += 1;
393 }
394 }
395 }
396
397 result
398 }
399
400 fn intersect_containers(c1: &Container, c2: &Container) -> Container {
401 match (c1, c2) {
402 (Container::Array(a1), Container::Array(a2)) => {
403 let mut result = Vec::new();
404 let mut i = 0;
405 let mut j = 0;
406 while i < a1.len() && j < a2.len() {
407 match a1[i].cmp(&a2[j]) {
408 std::cmp::Ordering::Less => i += 1,
409 std::cmp::Ordering::Greater => j += 1,
410 std::cmp::Ordering::Equal => {
411 result.push(a1[i]);
412 i += 1;
413 j += 1;
414 }
415 }
416 }
417 Container::Array(result)
418 }
419 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
420 let mut result = Box::new([0u64; 1024]);
421 for i in 0..1024 {
422 result[i] = b1[i] & b2[i];
423 }
424 let mut c = Container::Bitmap(result);
425 c.optimize();
426 c
427 }
428 (Container::Array(arr), Container::Bitmap(bm))
429 | (Container::Bitmap(bm), Container::Array(arr)) => {
430 let mut result = Vec::new();
431 for &val in arr {
432 let word_idx = (val / 64) as usize;
433 let bit_idx = val % 64;
434 if (bm[word_idx] >> bit_idx) & 1 == 1 {
435 result.push(val);
436 }
437 }
438 Container::Array(result)
439 }
440 _ => {
441 let arr1 = Self::container_to_array(c1);
443 let arr2 = Self::container_to_array(c2);
444 Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
445 }
446 }
447 }
448
449 fn container_to_array(c: &Container) -> Vec<u16> {
450 match c {
451 Container::Array(arr) => arr.clone(),
452 Container::Bitmap(bm) => {
453 let mut arr = Vec::new();
454 for (word_idx, &word) in bm.iter().enumerate() {
455 let mut w = word;
456 while w != 0 {
457 let bit_idx = w.trailing_zeros();
458 arr.push((word_idx * 64 + bit_idx as usize) as u16);
459 w &= w - 1;
460 }
461 }
462 arr
463 }
464 Container::Runs(runs) => {
465 let mut arr = Vec::new();
466 for &(start, len) in runs {
467 for i in 0..=len {
468 arr.push(start + i);
469 }
470 }
471 arr
472 }
473 }
474 }
475
476 pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
478 let mut result = RoaringBitmap::new();
479
480 let mut i = 0;
481 let mut j = 0;
482
483 while i < self.containers.len() || j < other.containers.len() {
484 if i >= self.containers.len() {
485 result.containers.push(other.containers[j].clone());
486 j += 1;
487 } else if j >= other.containers.len() {
488 result.containers.push(self.containers[i].clone());
489 i += 1;
490 } else {
491 let (high1, c1) = &self.containers[i];
492 let (high2, c2) = &other.containers[j];
493
494 match high1.cmp(high2) {
495 std::cmp::Ordering::Less => {
496 result.containers.push(self.containers[i].clone());
497 i += 1;
498 }
499 std::cmp::Ordering::Greater => {
500 result.containers.push(other.containers[j].clone());
501 j += 1;
502 }
503 std::cmp::Ordering::Equal => {
504 let united = Self::union_containers(c1, c2);
505 result.containers.push((*high1, united));
506 i += 1;
507 j += 1;
508 }
509 }
510 }
511 }
512
513 result
514 }
515
516 fn union_containers(c1: &Container, c2: &Container) -> Container {
517 match (c1, c2) {
518 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
519 let mut result = Box::new([0u64; 1024]);
520 for i in 0..1024 {
521 result[i] = b1[i] | b2[i];
522 }
523 Container::Bitmap(result)
524 }
525 _ => {
526 let arr1 = Self::container_to_array(c1);
528 let arr2 = Self::container_to_array(c2);
529
530 let mut result = Vec::with_capacity(arr1.len() + arr2.len());
531 let mut i = 0;
532 let mut j = 0;
533
534 while i < arr1.len() && j < arr2.len() {
535 match arr1[i].cmp(&arr2[j]) {
536 std::cmp::Ordering::Less => {
537 result.push(arr1[i]);
538 i += 1;
539 }
540 std::cmp::Ordering::Greater => {
541 result.push(arr2[j]);
542 j += 1;
543 }
544 std::cmp::Ordering::Equal => {
545 result.push(arr1[i]);
546 i += 1;
547 j += 1;
548 }
549 }
550 }
551
552 result.extend_from_slice(&arr1[i..]);
553 result.extend_from_slice(&arr2[j..]);
554
555 let mut c = Container::Array(result);
556 c.optimize();
557 c
558 }
559 }
560 }
561
562 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
564 writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
565 for (high, container) in &self.containers {
566 writer.write_u16::<LittleEndian>(*high)?;
567 container.serialize(writer)?;
568 }
569 Ok(())
570 }
571
572 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
574 let num_containers = reader.read_u32::<LittleEndian>()? as usize;
575 let mut containers = Vec::with_capacity(num_containers);
576
577 for _ in 0..num_containers {
578 let high = reader.read_u16::<LittleEndian>()?;
579 let container = Container::deserialize(reader)?;
580 containers.push((high, container));
581 }
582
583 Ok(Self { containers })
584 }
585
586 pub fn size_bytes(&self) -> usize {
588 4 + self
589 .containers
590 .iter()
591 .map(|(_, c)| 2 + c.size_bytes())
592 .sum::<usize>()
593 }
594
595 pub fn iter(&self) -> RoaringIterator<'_> {
597 RoaringIterator {
598 bitmap: self,
599 container_idx: 0,
600 value_idx: 0,
601 current_values: Vec::new(),
602 }
603 }
604}
605
606impl Default for RoaringBitmap {
607 fn default() -> Self {
608 Self::new()
609 }
610}
611
612pub struct RoaringIterator<'a> {
614 bitmap: &'a RoaringBitmap,
615 container_idx: usize,
616 value_idx: usize,
617 current_values: Vec<u16>,
618}
619
620impl<'a> Iterator for RoaringIterator<'a> {
621 type Item = u32;
622
623 fn next(&mut self) -> Option<Self::Item> {
624 loop {
625 if self.value_idx < self.current_values.len() {
626 let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
627 let low = self.current_values[self.value_idx] as u32;
628 self.value_idx += 1;
629 return Some((high << 16) | low);
630 }
631
632 if self.container_idx >= self.bitmap.containers.len() {
633 return None;
634 }
635
636 let (_, container) = &self.bitmap.containers[self.container_idx];
637 self.current_values = RoaringBitmap::container_to_array(container);
638 self.value_idx = 0;
639 self.container_idx += 1;
640 }
641 }
642}
643
644pub const ROARING_BLOCK_SIZE: usize = 65536;
646
647#[derive(Debug, Clone)]
649pub struct RoaringBlockInfo {
650 pub container_key: u16,
652 pub first_doc_id: u32,
654 pub last_doc_id: u32,
656 pub max_tf: u32,
658 pub max_block_score: f32,
660 pub num_docs: u32,
662}
663
664impl RoaringBlockInfo {
665 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
667 writer.write_u16::<LittleEndian>(self.container_key)?;
668 writer.write_u32::<LittleEndian>(self.first_doc_id)?;
669 writer.write_u32::<LittleEndian>(self.last_doc_id)?;
670 writer.write_u32::<LittleEndian>(self.max_tf)?;
671 writer.write_f32::<LittleEndian>(self.max_block_score)?;
672 writer.write_u32::<LittleEndian>(self.num_docs)?;
673 Ok(())
674 }
675
676 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
678 Ok(Self {
679 container_key: reader.read_u16::<LittleEndian>()?,
680 first_doc_id: reader.read_u32::<LittleEndian>()?,
681 last_doc_id: reader.read_u32::<LittleEndian>()?,
682 max_tf: reader.read_u32::<LittleEndian>()?,
683 max_block_score: reader.read_f32::<LittleEndian>()?,
684 num_docs: reader.read_u32::<LittleEndian>()?,
685 })
686 }
687}
688
689#[derive(Debug, Clone)]
691pub struct RoaringPostingList {
692 pub doc_ids: RoaringBitmap,
694 pub term_freqs: Vec<(u32, u32)>,
697 pub default_tf: u32,
699 pub max_tf: u32,
701 pub blocks: Vec<RoaringBlockInfo>,
703 pub max_score: f32,
705}
706
707impl RoaringPostingList {
708 const K1: f32 = 1.2;
710 const B: f32 = 0.75;
711
712 #[inline]
714 pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
715 let tf = max_tf as f32;
716 let min_length_norm = 1.0 - Self::B;
718 let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
719 idf * tf_norm
720 }
721
722 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
724 Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
725 }
726
727 pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
729 assert_eq!(doc_ids.len(), term_freqs.len());
730
731 if doc_ids.is_empty() {
732 return Self {
733 doc_ids: RoaringBitmap::new(),
734 term_freqs: Vec::new(),
735 default_tf: 1,
736 max_tf: 0,
737 blocks: Vec::new(),
738 max_score: 0.0,
739 };
740 }
741
742 let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
743
744 let mut tf_counts = std::collections::HashMap::new();
746 for &tf in term_freqs {
747 *tf_counts.entry(tf).or_insert(0u32) += 1;
748 }
749 let default_tf = tf_counts
750 .iter()
751 .max_by_key(|&(_, count)| count)
752 .map(|(&tf, _)| tf)
753 .unwrap_or(1);
754
755 let exceptions: Vec<(u32, u32)> = doc_ids
757 .iter()
758 .zip(term_freqs.iter())
759 .filter(|&(_, &tf)| tf != default_tf)
760 .map(|(&doc, &tf)| (doc, tf))
761 .collect();
762
763 let max_tf = *term_freqs.iter().max().unwrap_or(&1);
764
765 let mut blocks = Vec::new();
768 let mut max_score = 0.0f32;
769 let mut i = 0;
770
771 while i < doc_ids.len() {
772 let container_key = (doc_ids[i] >> 16) as u16;
773 let block_start = i;
774
775 while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
777 i += 1;
778 }
779
780 let block_doc_ids = &doc_ids[block_start..i];
781 let block_tfs = &term_freqs[block_start..i];
782 let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
783 let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
784 max_score = max_score.max(block_score);
785
786 blocks.push(RoaringBlockInfo {
787 container_key,
788 first_doc_id: block_doc_ids[0],
789 last_doc_id: *block_doc_ids.last().unwrap(),
790 max_tf: block_max_tf,
791 max_block_score: block_score,
792 num_docs: block_doc_ids.len() as u32,
793 });
794 }
795
796 Self {
797 doc_ids: bitmap,
798 term_freqs: exceptions,
799 default_tf,
800 max_tf,
801 blocks,
802 max_score,
803 }
804 }
805
806 pub fn get_tf(&self, doc_id: u32) -> u32 {
808 match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
810 Ok(idx) => self.term_freqs[idx].1,
811 Err(_) => self.default_tf,
812 }
813 }
814
815 pub fn len(&self) -> u32 {
817 self.doc_ids.cardinality()
818 }
819
820 pub fn is_empty(&self) -> bool {
822 self.doc_ids.is_empty()
823 }
824
825 pub fn num_blocks(&self) -> usize {
827 self.blocks.len()
828 }
829
830 pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
832 let container_key = (doc_id >> 16) as u16;
833 self.blocks
834 .binary_search_by_key(&container_key, |b| b.container_key)
835 .ok()
836 }
837
838 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
840 self.doc_ids.serialize(writer)?;
841 writer.write_u32::<LittleEndian>(self.default_tf)?;
842 writer.write_u32::<LittleEndian>(self.max_tf)?;
843 writer.write_f32::<LittleEndian>(self.max_score)?;
844 writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
845 for &(doc, tf) in &self.term_freqs {
846 writer.write_u32::<LittleEndian>(doc)?;
847 writer.write_u32::<LittleEndian>(tf)?;
848 }
849
850 writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
852 for block in &self.blocks {
853 block.serialize(writer)?;
854 }
855
856 Ok(())
857 }
858
859 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
861 let doc_ids = RoaringBitmap::deserialize(reader)?;
862 let default_tf = reader.read_u32::<LittleEndian>()?;
863 let max_tf = reader.read_u32::<LittleEndian>()?;
864 let max_score = reader.read_f32::<LittleEndian>()?;
865 let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
866 let mut term_freqs = Vec::with_capacity(num_exceptions);
867 for _ in 0..num_exceptions {
868 let doc = reader.read_u32::<LittleEndian>()?;
869 let tf = reader.read_u32::<LittleEndian>()?;
870 term_freqs.push((doc, tf));
871 }
872
873 let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
875 let mut blocks = Vec::with_capacity(num_blocks);
876 for _ in 0..num_blocks {
877 blocks.push(RoaringBlockInfo::deserialize(reader)?);
878 }
879
880 Ok(Self {
881 doc_ids,
882 term_freqs,
883 default_tf,
884 max_tf,
885 blocks,
886 max_score,
887 })
888 }
889
890 pub fn iterator(&self) -> RoaringPostingIterator<'_> {
892 RoaringPostingIterator {
893 list: self,
894 doc_iter: self.doc_ids.iter(),
895 current_doc: None,
896 current_block: 0,
897 }
898 }
899}
900
901pub struct RoaringPostingIterator<'a> {
903 list: &'a RoaringPostingList,
904 doc_iter: RoaringIterator<'a>,
905 current_doc: Option<u32>,
906 current_block: usize,
907}
908
909impl<'a> RoaringPostingIterator<'a> {
910 pub fn doc(&self) -> u32 {
912 self.current_doc.unwrap_or(u32::MAX)
913 }
914
915 pub fn term_freq(&self) -> u32 {
917 match self.current_doc {
918 Some(doc) => self.list.get_tf(doc),
919 None => 0,
920 }
921 }
922
923 pub fn advance(&mut self) -> u32 {
925 self.current_doc = self.doc_iter.next();
926 if let Some(doc) = self.current_doc
928 && !self.list.blocks.is_empty()
929 {
930 let container_key = (doc >> 16) as u16;
931 while self.current_block < self.list.blocks.len()
933 && self.list.blocks[self.current_block].container_key < container_key
934 {
935 self.current_block += 1;
936 }
937 }
938 self.doc()
939 }
940
941 pub fn init(&mut self) {
943 self.current_doc = self.doc_iter.next();
944 self.current_block = 0;
945 }
946
947 pub fn seek(&mut self, target: u32) -> u32 {
949 if !self.list.blocks.is_empty() {
951 let target_container = (target >> 16) as u16;
952
953 while self.current_block < self.list.blocks.len()
955 && self.list.blocks[self.current_block].last_doc_id < target
956 {
957 self.current_block += 1;
958 }
959
960 if self.current_block >= self.list.blocks.len() {
962 self.current_doc = None;
963 return u32::MAX;
964 }
965
966 let block = &self.list.blocks[self.current_block];
968 if block.container_key > target_container
969 || (block.container_key == target_container && block.first_doc_id > self.doc())
970 {
971 while let Some(doc) = self.current_doc {
973 if doc >= block.first_doc_id {
974 break;
975 }
976 self.current_doc = self.doc_iter.next();
977 }
978 }
979 }
980
981 while let Some(doc) = self.current_doc {
983 if doc >= target {
984 return doc;
985 }
986 self.current_doc = self.doc_iter.next();
987 }
988 u32::MAX
989 }
990
991 pub fn is_exhausted(&self) -> bool {
993 self.current_doc.is_none()
994 }
995
996 pub fn current_block_max_score(&self) -> f32 {
998 if self.current_doc.is_none() || self.list.blocks.is_empty() {
999 return 0.0;
1000 }
1001 if self.current_block < self.list.blocks.len() {
1002 self.list.blocks[self.current_block].max_block_score
1003 } else {
1004 0.0
1005 }
1006 }
1007
1008 pub fn current_block_max_tf(&self) -> u32 {
1010 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1011 return 0;
1012 }
1013 if self.current_block < self.list.blocks.len() {
1014 self.list.blocks[self.current_block].max_tf
1015 } else {
1016 0
1017 }
1018 }
1019
1020 pub fn max_remaining_score(&self) -> f32 {
1022 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1023 return 0.0;
1024 }
1025 self.list.blocks[self.current_block..]
1026 .iter()
1027 .map(|b| b.max_block_score)
1028 .fold(0.0f32, |a, b| a.max(b))
1029 }
1030
1031 pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1034 if self.list.blocks.is_empty() {
1035 return None;
1036 }
1037
1038 while self.current_block < self.list.blocks.len() {
1039 let block = &self.list.blocks[self.current_block];
1040 if block.last_doc_id >= target {
1041 while let Some(doc) = self.current_doc {
1043 if doc >= block.first_doc_id {
1044 break;
1045 }
1046 self.current_doc = self.doc_iter.next();
1047 }
1048 return Some((block.first_doc_id, block.max_block_score));
1049 }
1050 self.current_block += 1;
1051 }
1052
1053 self.current_doc = None;
1054 None
1055 }
1056}
1057
1058#[cfg(test)]
1059mod tests {
1060 use super::*;
1061
1062 #[test]
1063 fn test_roaring_basic() {
1064 let mut bitmap = RoaringBitmap::new();
1065 bitmap.insert(1);
1066 bitmap.insert(100);
1067 bitmap.insert(1000);
1068 bitmap.insert(100000);
1069
1070 assert!(bitmap.contains(1));
1071 assert!(bitmap.contains(100));
1072 assert!(bitmap.contains(1000));
1073 assert!(bitmap.contains(100000));
1074 assert!(!bitmap.contains(2));
1075 assert!(!bitmap.contains(50000));
1076
1077 assert_eq!(bitmap.cardinality(), 4);
1078 }
1079
1080 #[test]
1081 fn test_roaring_from_sorted() {
1082 let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1083 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1084
1085 assert_eq!(bitmap.cardinality(), 10000);
1086
1087 for &val in &values {
1088 assert!(bitmap.contains(val), "Missing value {}", val);
1089 }
1090 }
1091
1092 #[test]
1093 fn test_roaring_intersection() {
1094 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1095 let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1096
1097 let result = a.and(&b);
1098
1099 assert_eq!(result.cardinality(), 4);
1100 assert!(result.contains(2));
1101 assert!(result.contains(3));
1102 assert!(result.contains(200));
1103 assert!(result.contains(300));
1104 }
1105
1106 #[test]
1107 fn test_roaring_union() {
1108 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1109 let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1110
1111 let result = a.or(&b);
1112
1113 assert_eq!(result.cardinality(), 5);
1114 for i in 1..=5 {
1115 assert!(result.contains(i));
1116 }
1117 }
1118
1119 #[test]
1120 fn test_roaring_serialization() {
1121 let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1122 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1123
1124 let mut buffer = Vec::new();
1125 bitmap.serialize(&mut buffer).unwrap();
1126
1127 let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1128
1129 assert_eq!(restored.cardinality(), bitmap.cardinality());
1130 for &val in &values {
1131 assert!(restored.contains(val));
1132 }
1133 }
1134
1135 #[test]
1136 fn test_roaring_posting_list() {
1137 let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1138 let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1139
1140 let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1141
1142 assert_eq!(list.len(), 7);
1143 assert_eq!(list.default_tf, 1);
1144 assert_eq!(list.max_tf, 3);
1145
1146 assert_eq!(list.get_tf(1), 1);
1148 assert_eq!(list.get_tf(10), 2);
1149 assert_eq!(list.get_tf(100), 3);
1150 }
1151
1152 #[test]
1153 fn test_roaring_iterator() {
1154 let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1155 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1156
1157 let collected: Vec<u32> = bitmap.iter().collect();
1158 assert_eq!(collected, values);
1159 }
1160
1161 #[test]
1162 fn test_roaring_block_max() {
1163 let mut doc_ids = Vec::new();
1168 let mut term_freqs = Vec::new();
1169
1170 for i in 0..100 {
1172 doc_ids.push(i * 100);
1173 term_freqs.push(if i == 50 { 2 } else { 1 });
1174 }
1175
1176 for i in 0..100 {
1178 doc_ids.push(65536 + i * 100);
1179 term_freqs.push(if i == 25 { 5 } else { 1 });
1180 }
1181
1182 for i in 0..100 {
1184 doc_ids.push(131072 + i * 100);
1185 term_freqs.push(if i == 75 { 3 } else { 1 });
1186 }
1187
1188 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1189
1190 assert_eq!(list.num_blocks(), 3);
1192 assert_eq!(list.blocks[0].container_key, 0);
1193 assert_eq!(list.blocks[1].container_key, 1);
1194 assert_eq!(list.blocks[2].container_key, 2);
1195
1196 assert_eq!(list.blocks[0].max_tf, 2);
1197 assert_eq!(list.blocks[1].max_tf, 5);
1198 assert_eq!(list.blocks[2].max_tf, 3);
1199
1200 assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1202 assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1203
1204 assert_eq!(list.max_score, list.blocks[1].max_block_score);
1206
1207 let mut iter = list.iterator();
1209 iter.init();
1210 assert_eq!(iter.current_block_max_tf(), 2); iter.seek(65536);
1214 assert_eq!(iter.current_block_max_tf(), 5);
1215
1216 iter.seek(131072);
1218 assert_eq!(iter.current_block_max_tf(), 3);
1219 }
1220
1221 #[test]
1222 fn test_roaring_block_max_serialization() {
1223 let mut doc_ids = Vec::new();
1224 let mut term_freqs = Vec::new();
1225
1226 for i in 0..50 {
1228 doc_ids.push(i * 10);
1229 term_freqs.push((i % 5) + 1);
1230 }
1231 for i in 0..50 {
1232 doc_ids.push(65536 + i * 10);
1233 term_freqs.push((i % 3) + 1);
1234 }
1235
1236 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1237
1238 let mut buffer = Vec::new();
1239 list.serialize(&mut buffer).unwrap();
1240
1241 let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1242
1243 assert_eq!(restored.len(), list.len());
1244 assert_eq!(restored.max_tf, list.max_tf);
1245 assert_eq!(restored.max_score, list.max_score);
1246 assert_eq!(restored.num_blocks(), list.num_blocks());
1247
1248 for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1250 assert_eq!(orig.container_key, rest.container_key);
1251 assert_eq!(orig.first_doc_id, rest.first_doc_id);
1252 assert_eq!(orig.last_doc_id, rest.last_doc_id);
1253 assert_eq!(orig.max_tf, rest.max_tf);
1254 assert_eq!(orig.max_block_score, rest.max_block_score);
1255 }
1256
1257 let mut iter1 = list.iterator();
1259 let mut iter2 = restored.iterator();
1260 iter1.init();
1261 iter2.init();
1262
1263 while !iter1.is_exhausted() {
1264 assert_eq!(iter1.doc(), iter2.doc());
1265 assert_eq!(iter1.term_freq(), iter2.term_freq());
1266 iter1.advance();
1267 iter2.advance();
1268 }
1269 assert!(iter2.is_exhausted());
1270 }
1271}