1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18#[derive(Debug, Clone)]
20enum Container {
21 Array(Vec<u16>),
23 Bitmap(Box<[u64; 1024]>),
25 Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30 fn new_array() -> Self {
31 Container::Array(Vec::new())
32 }
33
34 #[allow(dead_code)]
35 fn new_bitmap() -> Self {
36 Container::Bitmap(Box::new([0u64; 1024]))
37 }
38
39 fn cardinality(&self) -> u32 {
40 match self {
41 Container::Array(arr) => arr.len() as u32,
42 Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
43 Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44 }
45 }
46
47 #[inline]
49 fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
50 #[cfg(target_arch = "aarch64")]
51 {
52 use std::arch::aarch64::*;
53
54 let mut total = 0u32;
55 let bytes = bm.as_ptr() as *const u8;
56
57 unsafe {
59 for i in (0..8192).step_by(16) {
60 let v = vld1q_u8(bytes.add(i));
61 let cnt = vcntq_u8(v);
62 total += vaddlvq_u8(cnt) as u32;
63 }
64 }
65
66 total
67 }
68
69 #[cfg(target_arch = "x86_64")]
70 {
71 let mut total = 0u32;
73 for &word in bm.iter() {
74 total += word.count_ones();
75 }
76 total
77 }
78
79 #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
80 {
81 bm.iter().map(|w| w.count_ones()).sum()
82 }
83 }
84
85 fn contains(&self, val: u16) -> bool {
86 match self {
87 Container::Array(arr) => arr.binary_search(&val).is_ok(),
88 Container::Bitmap(bm) => {
89 let word_idx = (val / 64) as usize;
90 let bit_idx = val % 64;
91 (bm[word_idx] >> bit_idx) & 1 == 1
92 }
93 Container::Runs(runs) => {
94 for &(start, len) in runs {
95 if val >= start && val <= start + len {
96 return true;
97 }
98 if val < start {
99 return false;
100 }
101 }
102 false
103 }
104 }
105 }
106
107 fn insert(&mut self, val: u16) -> bool {
108 match self {
109 Container::Array(arr) => {
110 match arr.binary_search(&val) {
111 Ok(_) => false, Err(pos) => {
113 arr.insert(pos, val);
114 true
115 }
116 }
117 }
118 Container::Bitmap(bm) => {
119 let word_idx = (val / 64) as usize;
120 let bit_idx = val % 64;
121 let old = bm[word_idx];
122 bm[word_idx] |= 1u64 << bit_idx;
123 old != bm[word_idx]
124 }
125 Container::Runs(runs) => {
126 let mut arr: Vec<u16> = Vec::new();
128 for &(start, len) in runs.iter() {
129 for i in 0..=len {
130 arr.push(start + i);
131 }
132 }
133 let inserted = match arr.binary_search(&val) {
134 Ok(_) => false,
135 Err(pos) => {
136 arr.insert(pos, val);
137 true
138 }
139 };
140 *self = Container::Array(arr);
141 inserted
142 }
143 }
144 }
145
146 fn optimize(&mut self) {
147 let card = self.cardinality() as usize;
148
149 match self {
150 Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
151 let mut bm = Box::new([0u64; 1024]);
153 for &val in arr.iter() {
154 let word_idx = (val / 64) as usize;
155 let bit_idx = val % 64;
156 bm[word_idx] |= 1u64 << bit_idx;
157 }
158 *self = Container::Bitmap(bm);
159 }
160 Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
161 let mut arr = Vec::with_capacity(card);
163 for (word_idx, &word) in bm.iter().enumerate() {
164 let mut w = word;
165 while w != 0 {
166 let bit_idx = w.trailing_zeros();
167 arr.push((word_idx * 64 + bit_idx as usize) as u16);
168 w &= w - 1;
169 }
170 }
171 *self = Container::Array(arr);
172 }
173 _ => {}
174 }
175
176 self.try_run_encode();
178 }
179
180 fn try_run_encode(&mut self) {
181 let arr = match self {
182 Container::Array(arr) if arr.len() >= 4 => arr,
183 _ => return,
184 };
185
186 let mut runs = Vec::new();
187 let mut i = 0;
188
189 while i < arr.len() {
190 let start = arr[i];
191 let mut end = start;
192
193 while i + 1 < arr.len() && arr[i + 1] == end + 1 {
194 end = arr[i + 1];
195 i += 1;
196 }
197
198 runs.push((start, end - start));
199 i += 1;
200 }
201
202 if runs.len() * 4 < arr.len() * 2 {
206 *self = Container::Runs(runs);
207 }
208 }
209
210 fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
211 match self {
212 Container::Array(arr) => {
213 writer.write_u8(0)?; writer.write_u16::<LittleEndian>(arr.len() as u16)?;
215 for &val in arr {
216 writer.write_u16::<LittleEndian>(val)?;
217 }
218 }
219 Container::Bitmap(bm) => {
220 writer.write_u8(1)?; for &word in bm.iter() {
222 writer.write_u64::<LittleEndian>(word)?;
223 }
224 }
225 Container::Runs(runs) => {
226 writer.write_u8(2)?; writer.write_u16::<LittleEndian>(runs.len() as u16)?;
228 for &(start, len) in runs {
229 writer.write_u16::<LittleEndian>(start)?;
230 writer.write_u16::<LittleEndian>(len)?;
231 }
232 }
233 }
234 Ok(())
235 }
236
237 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
238 let tag = reader.read_u8()?;
239 match tag {
240 0 => {
241 let len = reader.read_u16::<LittleEndian>()? as usize;
242 let mut arr = Vec::with_capacity(len);
243 for _ in 0..len {
244 arr.push(reader.read_u16::<LittleEndian>()?);
245 }
246 Ok(Container::Array(arr))
247 }
248 1 => {
249 let mut bm = Box::new([0u64; 1024]);
250 for word in bm.iter_mut() {
251 *word = reader.read_u64::<LittleEndian>()?;
252 }
253 Ok(Container::Bitmap(bm))
254 }
255 2 => {
256 let len = reader.read_u16::<LittleEndian>()? as usize;
257 let mut runs = Vec::with_capacity(len);
258 for _ in 0..len {
259 let start = reader.read_u16::<LittleEndian>()?;
260 let run_len = reader.read_u16::<LittleEndian>()?;
261 runs.push((start, run_len));
262 }
263 Ok(Container::Runs(runs))
264 }
265 _ => Err(io::Error::new(
266 io::ErrorKind::InvalidData,
267 "Invalid container type",
268 )),
269 }
270 }
271
272 fn size_bytes(&self) -> usize {
273 match self {
274 Container::Array(arr) => arr.len() * 2 + 4,
275 Container::Bitmap(_) => 8 * 1024 + 1,
276 Container::Runs(runs) => runs.len() * 4 + 4,
277 }
278 }
279}
280
281#[derive(Debug, Clone)]
283pub struct RoaringBitmap {
284 containers: Vec<(u16, Container)>,
286}
287
288impl RoaringBitmap {
289 pub fn new() -> Self {
291 Self {
292 containers: Vec::new(),
293 }
294 }
295
296 pub fn from_sorted_slice(values: &[u32]) -> Self {
298 let mut bitmap = Self::new();
299
300 if values.is_empty() {
301 return bitmap;
302 }
303
304 let mut current_high = (values[0] >> 16) as u16;
305 let mut current_container = Container::new_array();
306
307 for &val in values {
308 let high = (val >> 16) as u16;
309 let low = val as u16;
310
311 if high != current_high {
312 current_container.optimize();
313 bitmap.containers.push((current_high, current_container));
314 current_high = high;
315 current_container = Container::new_array();
316 }
317
318 current_container.insert(low);
319 }
320
321 current_container.optimize();
322 bitmap.containers.push((current_high, current_container));
323
324 bitmap
325 }
326
327 pub fn insert(&mut self, val: u32) -> bool {
329 let high = (val >> 16) as u16;
330 let low = val as u16;
331
332 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
334 Ok(idx) => self.containers[idx].1.insert(low),
335 Err(idx) => {
336 let mut container = Container::new_array();
337 container.insert(low);
338 self.containers.insert(idx, (high, container));
339 true
340 }
341 }
342 }
343
344 pub fn contains(&self, val: u32) -> bool {
346 let high = (val >> 16) as u16;
347 let low = val as u16;
348
349 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
350 Ok(idx) => self.containers[idx].1.contains(low),
351 Err(_) => false,
352 }
353 }
354
355 pub fn cardinality(&self) -> u32 {
357 self.containers.iter().map(|(_, c)| c.cardinality()).sum()
358 }
359
360 pub fn is_empty(&self) -> bool {
362 self.containers.is_empty()
363 }
364
365 pub fn optimize(&mut self) {
367 for (_, container) in &mut self.containers {
368 container.optimize();
369 }
370 }
371
372 pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
374 let mut result = RoaringBitmap::new();
375
376 let mut i = 0;
377 let mut j = 0;
378
379 while i < self.containers.len() && j < other.containers.len() {
380 let (high1, c1) = &self.containers[i];
381 let (high2, c2) = &other.containers[j];
382
383 match high1.cmp(high2) {
384 std::cmp::Ordering::Less => i += 1,
385 std::cmp::Ordering::Greater => j += 1,
386 std::cmp::Ordering::Equal => {
387 let intersected = Self::intersect_containers(c1, c2);
388 if intersected.cardinality() > 0 {
389 result.containers.push((*high1, intersected));
390 }
391 i += 1;
392 j += 1;
393 }
394 }
395 }
396
397 result
398 }
399
400 fn intersect_containers(c1: &Container, c2: &Container) -> Container {
401 match (c1, c2) {
402 (Container::Array(a1), Container::Array(a2)) => {
403 let mut result = Vec::new();
404 let mut i = 0;
405 let mut j = 0;
406 while i < a1.len() && j < a2.len() {
407 match a1[i].cmp(&a2[j]) {
408 std::cmp::Ordering::Less => i += 1,
409 std::cmp::Ordering::Greater => j += 1,
410 std::cmp::Ordering::Equal => {
411 result.push(a1[i]);
412 i += 1;
413 j += 1;
414 }
415 }
416 }
417 Container::Array(result)
418 }
419 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
420 let mut result = Box::new([0u64; 1024]);
421 for i in 0..1024 {
422 result[i] = b1[i] & b2[i];
423 }
424 let mut c = Container::Bitmap(result);
425 c.optimize();
426 c
427 }
428 (Container::Array(arr), Container::Bitmap(bm))
429 | (Container::Bitmap(bm), Container::Array(arr)) => {
430 let mut result = Vec::new();
431 for &val in arr {
432 let word_idx = (val / 64) as usize;
433 let bit_idx = val % 64;
434 if (bm[word_idx] >> bit_idx) & 1 == 1 {
435 result.push(val);
436 }
437 }
438 Container::Array(result)
439 }
440 _ => {
441 let arr1 = Self::container_to_array(c1);
443 let arr2 = Self::container_to_array(c2);
444 Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
445 }
446 }
447 }
448
449 fn container_to_array(c: &Container) -> Vec<u16> {
450 match c {
451 Container::Array(arr) => arr.clone(),
452 Container::Bitmap(bm) => {
453 let mut arr = Vec::new();
454 for (word_idx, &word) in bm.iter().enumerate() {
455 let mut w = word;
456 while w != 0 {
457 let bit_idx = w.trailing_zeros();
458 arr.push((word_idx * 64 + bit_idx as usize) as u16);
459 w &= w - 1;
460 }
461 }
462 arr
463 }
464 Container::Runs(runs) => {
465 let mut arr = Vec::new();
466 for &(start, len) in runs {
467 for i in 0..=len {
468 arr.push(start + i);
469 }
470 }
471 arr
472 }
473 }
474 }
475
476 pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
478 let mut result = RoaringBitmap::new();
479
480 let mut i = 0;
481 let mut j = 0;
482
483 while i < self.containers.len() || j < other.containers.len() {
484 if i >= self.containers.len() {
485 result.containers.push(other.containers[j].clone());
486 j += 1;
487 } else if j >= other.containers.len() {
488 result.containers.push(self.containers[i].clone());
489 i += 1;
490 } else {
491 let (high1, c1) = &self.containers[i];
492 let (high2, c2) = &other.containers[j];
493
494 match high1.cmp(high2) {
495 std::cmp::Ordering::Less => {
496 result.containers.push(self.containers[i].clone());
497 i += 1;
498 }
499 std::cmp::Ordering::Greater => {
500 result.containers.push(other.containers[j].clone());
501 j += 1;
502 }
503 std::cmp::Ordering::Equal => {
504 let united = Self::union_containers(c1, c2);
505 result.containers.push((*high1, united));
506 i += 1;
507 j += 1;
508 }
509 }
510 }
511 }
512
513 result
514 }
515
516 fn union_containers(c1: &Container, c2: &Container) -> Container {
517 match (c1, c2) {
518 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
519 let mut result = Box::new([0u64; 1024]);
520 for i in 0..1024 {
521 result[i] = b1[i] | b2[i];
522 }
523 Container::Bitmap(result)
524 }
525 _ => {
526 let arr1 = Self::container_to_array(c1);
528 let arr2 = Self::container_to_array(c2);
529
530 let mut result = Vec::with_capacity(arr1.len() + arr2.len());
531 let mut i = 0;
532 let mut j = 0;
533
534 while i < arr1.len() && j < arr2.len() {
535 match arr1[i].cmp(&arr2[j]) {
536 std::cmp::Ordering::Less => {
537 result.push(arr1[i]);
538 i += 1;
539 }
540 std::cmp::Ordering::Greater => {
541 result.push(arr2[j]);
542 j += 1;
543 }
544 std::cmp::Ordering::Equal => {
545 result.push(arr1[i]);
546 i += 1;
547 j += 1;
548 }
549 }
550 }
551
552 result.extend_from_slice(&arr1[i..]);
553 result.extend_from_slice(&arr2[j..]);
554
555 let mut c = Container::Array(result);
556 c.optimize();
557 c
558 }
559 }
560 }
561
562 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
564 writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
565 for (high, container) in &self.containers {
566 writer.write_u16::<LittleEndian>(*high)?;
567 container.serialize(writer)?;
568 }
569 Ok(())
570 }
571
572 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
574 let num_containers = reader.read_u32::<LittleEndian>()? as usize;
575 let mut containers = Vec::with_capacity(num_containers);
576
577 for _ in 0..num_containers {
578 let high = reader.read_u16::<LittleEndian>()?;
579 let container = Container::deserialize(reader)?;
580 containers.push((high, container));
581 }
582
583 Ok(Self { containers })
584 }
585
586 pub fn size_bytes(&self) -> usize {
588 4 + self
589 .containers
590 .iter()
591 .map(|(_, c)| 2 + c.size_bytes())
592 .sum::<usize>()
593 }
594
595 pub fn iter(&self) -> RoaringIterator<'_> {
597 RoaringIterator {
599 bitmap: self,
600 container_idx: 0,
601 value_idx: 0,
602 current_len: 0,
603 current_values: Vec::with_capacity(ARRAY_TO_BITMAP_THRESHOLD),
604 }
605 }
606}
607
608impl Default for RoaringBitmap {
609 fn default() -> Self {
610 Self::new()
611 }
612}
613
614pub struct RoaringIterator<'a> {
616 bitmap: &'a RoaringBitmap,
617 container_idx: usize,
618 value_idx: usize,
619 current_len: usize,
621 current_values: Vec<u16>,
623}
624
625impl<'a> Iterator for RoaringIterator<'a> {
626 type Item = u32;
627
628 fn next(&mut self) -> Option<Self::Item> {
629 loop {
630 if self.value_idx < self.current_len {
631 let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
632 let low = self.current_values[self.value_idx] as u32;
633 self.value_idx += 1;
634 return Some((high << 16) | low);
635 }
636
637 if self.container_idx >= self.bitmap.containers.len() {
638 return None;
639 }
640
641 let (_, container) = &self.bitmap.containers[self.container_idx];
642 self.current_len = Self::decode_container_into(container, &mut self.current_values);
644 self.value_idx = 0;
645 self.container_idx += 1;
646 }
647 }
648}
649
650impl<'a> RoaringIterator<'a> {
651 #[inline]
654 fn decode_container_into(container: &Container, output: &mut Vec<u16>) -> usize {
655 match container {
656 Container::Array(arr) => {
657 let len = arr.len();
658 if output.len() < len {
659 output.resize(len, 0);
660 }
661 output[..len].copy_from_slice(arr);
662 len
663 }
664 Container::Bitmap(bm) => Self::decode_bitmap_into(bm, output),
665 Container::Runs(runs) => {
666 let mut count = 0;
667 for &(start, len) in runs {
668 for i in 0..=len {
669 let val = start + i;
670 if count >= output.len() {
671 output.push(val);
672 } else {
673 output[count] = val;
674 }
675 count += 1;
676 }
677 }
678 count
679 }
680 }
681 }
682
683 #[inline]
686 fn decode_bitmap_into(bm: &[u64; 1024], output: &mut Vec<u16>) -> usize {
687 let mut count = 0;
688
689 for (word_idx, &word) in bm.iter().enumerate() {
690 if word == 0 {
691 continue;
692 }
693
694 let base = (word_idx * 64) as u16;
695 let mut w = word;
696
697 while w != 0 {
699 let bit_idx = w.trailing_zeros() as u16;
700 let val = base + bit_idx;
701
702 if count >= output.len() {
703 output.push(val);
704 } else {
705 output[count] = val;
706 }
707 count += 1;
708 w &= w - 1; }
710 }
711
712 count
713 }
714}
715
716pub const ROARING_BLOCK_SIZE: usize = 65536;
718
719#[derive(Debug, Clone)]
721pub struct RoaringBlockInfo {
722 pub container_key: u16,
724 pub first_doc_id: u32,
726 pub last_doc_id: u32,
728 pub max_tf: u32,
730 pub max_block_score: f32,
732 pub num_docs: u32,
734}
735
736impl RoaringBlockInfo {
737 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
739 writer.write_u16::<LittleEndian>(self.container_key)?;
740 writer.write_u32::<LittleEndian>(self.first_doc_id)?;
741 writer.write_u32::<LittleEndian>(self.last_doc_id)?;
742 writer.write_u32::<LittleEndian>(self.max_tf)?;
743 writer.write_f32::<LittleEndian>(self.max_block_score)?;
744 writer.write_u32::<LittleEndian>(self.num_docs)?;
745 Ok(())
746 }
747
748 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
750 Ok(Self {
751 container_key: reader.read_u16::<LittleEndian>()?,
752 first_doc_id: reader.read_u32::<LittleEndian>()?,
753 last_doc_id: reader.read_u32::<LittleEndian>()?,
754 max_tf: reader.read_u32::<LittleEndian>()?,
755 max_block_score: reader.read_f32::<LittleEndian>()?,
756 num_docs: reader.read_u32::<LittleEndian>()?,
757 })
758 }
759}
760
761#[derive(Debug, Clone)]
763pub struct RoaringPostingList {
764 pub doc_ids: RoaringBitmap,
766 pub term_freqs: Vec<(u32, u32)>,
769 pub default_tf: u32,
771 pub max_tf: u32,
773 pub blocks: Vec<RoaringBlockInfo>,
775 pub max_score: f32,
777}
778
779impl RoaringPostingList {
780 const K1: f32 = 1.2;
782 const B: f32 = 0.75;
783
784 #[inline]
786 pub fn compute_bm25_upper_bound(max_tf: u32, idf: f32) -> f32 {
787 let tf = max_tf as f32;
788 let min_length_norm = 1.0 - Self::B;
790 let tf_norm = (tf * (Self::K1 + 1.0)) / (tf + Self::K1 * min_length_norm);
791 idf * tf_norm
792 }
793
794 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
796 Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
797 }
798
799 pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
801 assert_eq!(doc_ids.len(), term_freqs.len());
802
803 if doc_ids.is_empty() {
804 return Self {
805 doc_ids: RoaringBitmap::new(),
806 term_freqs: Vec::new(),
807 default_tf: 1,
808 max_tf: 0,
809 blocks: Vec::new(),
810 max_score: 0.0,
811 };
812 }
813
814 let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
815
816 let mut tf_counts = std::collections::HashMap::new();
818 for &tf in term_freqs {
819 *tf_counts.entry(tf).or_insert(0u32) += 1;
820 }
821 let default_tf = tf_counts
822 .iter()
823 .max_by_key(|&(_, count)| count)
824 .map(|(&tf, _)| tf)
825 .unwrap_or(1);
826
827 let exceptions: Vec<(u32, u32)> = doc_ids
829 .iter()
830 .zip(term_freqs.iter())
831 .filter(|&(_, &tf)| tf != default_tf)
832 .map(|(&doc, &tf)| (doc, tf))
833 .collect();
834
835 let max_tf = *term_freqs.iter().max().unwrap_or(&1);
836
837 let mut blocks = Vec::new();
840 let mut max_score = 0.0f32;
841 let mut i = 0;
842
843 while i < doc_ids.len() {
844 let container_key = (doc_ids[i] >> 16) as u16;
845 let block_start = i;
846
847 while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
849 i += 1;
850 }
851
852 let block_doc_ids = &doc_ids[block_start..i];
853 let block_tfs = &term_freqs[block_start..i];
854 let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
855 let block_score = Self::compute_bm25_upper_bound(block_max_tf, idf);
856 max_score = max_score.max(block_score);
857
858 blocks.push(RoaringBlockInfo {
859 container_key,
860 first_doc_id: block_doc_ids[0],
861 last_doc_id: *block_doc_ids.last().unwrap(),
862 max_tf: block_max_tf,
863 max_block_score: block_score,
864 num_docs: block_doc_ids.len() as u32,
865 });
866 }
867
868 Self {
869 doc_ids: bitmap,
870 term_freqs: exceptions,
871 default_tf,
872 max_tf,
873 blocks,
874 max_score,
875 }
876 }
877
878 pub fn get_tf(&self, doc_id: u32) -> u32 {
880 match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
882 Ok(idx) => self.term_freqs[idx].1,
883 Err(_) => self.default_tf,
884 }
885 }
886
887 pub fn len(&self) -> u32 {
889 self.doc_ids.cardinality()
890 }
891
892 pub fn is_empty(&self) -> bool {
894 self.doc_ids.is_empty()
895 }
896
897 pub fn num_blocks(&self) -> usize {
899 self.blocks.len()
900 }
901
902 pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
904 let container_key = (doc_id >> 16) as u16;
905 self.blocks
906 .binary_search_by_key(&container_key, |b| b.container_key)
907 .ok()
908 }
909
910 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
912 self.doc_ids.serialize(writer)?;
913 writer.write_u32::<LittleEndian>(self.default_tf)?;
914 writer.write_u32::<LittleEndian>(self.max_tf)?;
915 writer.write_f32::<LittleEndian>(self.max_score)?;
916 writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
917 for &(doc, tf) in &self.term_freqs {
918 writer.write_u32::<LittleEndian>(doc)?;
919 writer.write_u32::<LittleEndian>(tf)?;
920 }
921
922 writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
924 for block in &self.blocks {
925 block.serialize(writer)?;
926 }
927
928 Ok(())
929 }
930
931 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
933 let doc_ids = RoaringBitmap::deserialize(reader)?;
934 let default_tf = reader.read_u32::<LittleEndian>()?;
935 let max_tf = reader.read_u32::<LittleEndian>()?;
936 let max_score = reader.read_f32::<LittleEndian>()?;
937 let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
938 let mut term_freqs = Vec::with_capacity(num_exceptions);
939 for _ in 0..num_exceptions {
940 let doc = reader.read_u32::<LittleEndian>()?;
941 let tf = reader.read_u32::<LittleEndian>()?;
942 term_freqs.push((doc, tf));
943 }
944
945 let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
947 let mut blocks = Vec::with_capacity(num_blocks);
948 for _ in 0..num_blocks {
949 blocks.push(RoaringBlockInfo::deserialize(reader)?);
950 }
951
952 Ok(Self {
953 doc_ids,
954 term_freqs,
955 default_tf,
956 max_tf,
957 blocks,
958 max_score,
959 })
960 }
961
962 pub fn iterator(&self) -> RoaringPostingIterator<'_> {
964 RoaringPostingIterator {
965 list: self,
966 doc_iter: self.doc_ids.iter(),
967 current_doc: None,
968 current_block: 0,
969 }
970 }
971}
972
973pub struct RoaringPostingIterator<'a> {
975 list: &'a RoaringPostingList,
976 doc_iter: RoaringIterator<'a>,
977 current_doc: Option<u32>,
978 current_block: usize,
979}
980
981impl<'a> RoaringPostingIterator<'a> {
982 pub fn doc(&self) -> u32 {
984 self.current_doc.unwrap_or(u32::MAX)
985 }
986
987 pub fn term_freq(&self) -> u32 {
989 match self.current_doc {
990 Some(doc) => self.list.get_tf(doc),
991 None => 0,
992 }
993 }
994
995 pub fn advance(&mut self) -> u32 {
997 self.current_doc = self.doc_iter.next();
998 if let Some(doc) = self.current_doc
1000 && !self.list.blocks.is_empty()
1001 {
1002 let container_key = (doc >> 16) as u16;
1003 while self.current_block < self.list.blocks.len()
1005 && self.list.blocks[self.current_block].container_key < container_key
1006 {
1007 self.current_block += 1;
1008 }
1009 }
1010 self.doc()
1011 }
1012
1013 pub fn init(&mut self) {
1015 self.current_doc = self.doc_iter.next();
1016 self.current_block = 0;
1017 }
1018
1019 pub fn seek(&mut self, target: u32) -> u32 {
1021 if !self.list.blocks.is_empty() {
1023 let target_container = (target >> 16) as u16;
1024
1025 while self.current_block < self.list.blocks.len()
1027 && self.list.blocks[self.current_block].last_doc_id < target
1028 {
1029 self.current_block += 1;
1030 }
1031
1032 if self.current_block >= self.list.blocks.len() {
1034 self.current_doc = None;
1035 return u32::MAX;
1036 }
1037
1038 let block = &self.list.blocks[self.current_block];
1040 if block.container_key > target_container
1041 || (block.container_key == target_container && block.first_doc_id > self.doc())
1042 {
1043 while let Some(doc) = self.current_doc {
1045 if doc >= block.first_doc_id {
1046 break;
1047 }
1048 self.current_doc = self.doc_iter.next();
1049 }
1050 }
1051 }
1052
1053 while let Some(doc) = self.current_doc {
1055 if doc >= target {
1056 return doc;
1057 }
1058 self.current_doc = self.doc_iter.next();
1059 }
1060 u32::MAX
1061 }
1062
1063 pub fn is_exhausted(&self) -> bool {
1065 self.current_doc.is_none()
1066 }
1067
1068 pub fn current_block_max_score(&self) -> f32 {
1070 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1071 return 0.0;
1072 }
1073 if self.current_block < self.list.blocks.len() {
1074 self.list.blocks[self.current_block].max_block_score
1075 } else {
1076 0.0
1077 }
1078 }
1079
1080 pub fn current_block_max_tf(&self) -> u32 {
1082 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1083 return 0;
1084 }
1085 if self.current_block < self.list.blocks.len() {
1086 self.list.blocks[self.current_block].max_tf
1087 } else {
1088 0
1089 }
1090 }
1091
1092 pub fn max_remaining_score(&self) -> f32 {
1094 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1095 return 0.0;
1096 }
1097 self.list.blocks[self.current_block..]
1098 .iter()
1099 .map(|b| b.max_block_score)
1100 .fold(0.0f32, |a, b| a.max(b))
1101 }
1102
1103 pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1106 if self.list.blocks.is_empty() {
1107 return None;
1108 }
1109
1110 while self.current_block < self.list.blocks.len() {
1111 let block = &self.list.blocks[self.current_block];
1112 if block.last_doc_id >= target {
1113 while let Some(doc) = self.current_doc {
1115 if doc >= block.first_doc_id {
1116 break;
1117 }
1118 self.current_doc = self.doc_iter.next();
1119 }
1120 return Some((block.first_doc_id, block.max_block_score));
1121 }
1122 self.current_block += 1;
1123 }
1124
1125 self.current_doc = None;
1126 None
1127 }
1128}
1129
1130#[cfg(test)]
1131mod tests {
1132 use super::*;
1133
1134 #[test]
1135 fn test_roaring_basic() {
1136 let mut bitmap = RoaringBitmap::new();
1137 bitmap.insert(1);
1138 bitmap.insert(100);
1139 bitmap.insert(1000);
1140 bitmap.insert(100000);
1141
1142 assert!(bitmap.contains(1));
1143 assert!(bitmap.contains(100));
1144 assert!(bitmap.contains(1000));
1145 assert!(bitmap.contains(100000));
1146 assert!(!bitmap.contains(2));
1147 assert!(!bitmap.contains(50000));
1148
1149 assert_eq!(bitmap.cardinality(), 4);
1150 }
1151
1152 #[test]
1153 fn test_roaring_from_sorted() {
1154 let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1155 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1156
1157 assert_eq!(bitmap.cardinality(), 10000);
1158
1159 for &val in &values {
1160 assert!(bitmap.contains(val), "Missing value {}", val);
1161 }
1162 }
1163
1164 #[test]
1165 fn test_roaring_intersection() {
1166 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1167 let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1168
1169 let result = a.and(&b);
1170
1171 assert_eq!(result.cardinality(), 4);
1172 assert!(result.contains(2));
1173 assert!(result.contains(3));
1174 assert!(result.contains(200));
1175 assert!(result.contains(300));
1176 }
1177
1178 #[test]
1179 fn test_roaring_union() {
1180 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1181 let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1182
1183 let result = a.or(&b);
1184
1185 assert_eq!(result.cardinality(), 5);
1186 for i in 1..=5 {
1187 assert!(result.contains(i));
1188 }
1189 }
1190
1191 #[test]
1192 fn test_roaring_serialization() {
1193 let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1194 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1195
1196 let mut buffer = Vec::new();
1197 bitmap.serialize(&mut buffer).unwrap();
1198
1199 let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1200
1201 assert_eq!(restored.cardinality(), bitmap.cardinality());
1202 for &val in &values {
1203 assert!(restored.contains(val));
1204 }
1205 }
1206
1207 #[test]
1208 fn test_roaring_posting_list() {
1209 let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1210 let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1211
1212 let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1213
1214 assert_eq!(list.len(), 7);
1215 assert_eq!(list.default_tf, 1);
1216 assert_eq!(list.max_tf, 3);
1217
1218 assert_eq!(list.get_tf(1), 1);
1220 assert_eq!(list.get_tf(10), 2);
1221 assert_eq!(list.get_tf(100), 3);
1222 }
1223
1224 #[test]
1225 fn test_roaring_iterator() {
1226 let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1227 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1228
1229 let collected: Vec<u32> = bitmap.iter().collect();
1230 assert_eq!(collected, values);
1231 }
1232
1233 #[test]
1234 fn test_roaring_block_max() {
1235 let mut doc_ids = Vec::new();
1240 let mut term_freqs = Vec::new();
1241
1242 for i in 0..100 {
1244 doc_ids.push(i * 100);
1245 term_freqs.push(if i == 50 { 2 } else { 1 });
1246 }
1247
1248 for i in 0..100 {
1250 doc_ids.push(65536 + i * 100);
1251 term_freqs.push(if i == 25 { 5 } else { 1 });
1252 }
1253
1254 for i in 0..100 {
1256 doc_ids.push(131072 + i * 100);
1257 term_freqs.push(if i == 75 { 3 } else { 1 });
1258 }
1259
1260 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1261
1262 assert_eq!(list.num_blocks(), 3);
1264 assert_eq!(list.blocks[0].container_key, 0);
1265 assert_eq!(list.blocks[1].container_key, 1);
1266 assert_eq!(list.blocks[2].container_key, 2);
1267
1268 assert_eq!(list.blocks[0].max_tf, 2);
1269 assert_eq!(list.blocks[1].max_tf, 5);
1270 assert_eq!(list.blocks[2].max_tf, 3);
1271
1272 assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1274 assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1275
1276 assert_eq!(list.max_score, list.blocks[1].max_block_score);
1278
1279 let mut iter = list.iterator();
1281 iter.init();
1282 assert_eq!(iter.current_block_max_tf(), 2); iter.seek(65536);
1286 assert_eq!(iter.current_block_max_tf(), 5);
1287
1288 iter.seek(131072);
1290 assert_eq!(iter.current_block_max_tf(), 3);
1291 }
1292
1293 #[test]
1294 fn test_roaring_block_max_serialization() {
1295 let mut doc_ids = Vec::new();
1296 let mut term_freqs = Vec::new();
1297
1298 for i in 0..50 {
1300 doc_ids.push(i * 10);
1301 term_freqs.push((i % 5) + 1);
1302 }
1303 for i in 0..50 {
1304 doc_ids.push(65536 + i * 10);
1305 term_freqs.push((i % 3) + 1);
1306 }
1307
1308 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1309
1310 let mut buffer = Vec::new();
1311 list.serialize(&mut buffer).unwrap();
1312
1313 let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1314
1315 assert_eq!(restored.len(), list.len());
1316 assert_eq!(restored.max_tf, list.max_tf);
1317 assert_eq!(restored.max_score, list.max_score);
1318 assert_eq!(restored.num_blocks(), list.num_blocks());
1319
1320 for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1322 assert_eq!(orig.container_key, rest.container_key);
1323 assert_eq!(orig.first_doc_id, rest.first_doc_id);
1324 assert_eq!(orig.last_doc_id, rest.last_doc_id);
1325 assert_eq!(orig.max_tf, rest.max_tf);
1326 assert_eq!(orig.max_block_score, rest.max_block_score);
1327 }
1328
1329 let mut iter1 = list.iterator();
1331 let mut iter2 = restored.iterator();
1332 iter1.init();
1333 iter2.init();
1334
1335 while !iter1.is_exhausted() {
1336 assert_eq!(iter1.doc(), iter2.doc());
1337 assert_eq!(iter1.term_freq(), iter2.term_freq());
1338 iter1.advance();
1339 iter2.advance();
1340 }
1341 assert!(iter2.is_exhausted());
1342 }
1343}