1use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
12use std::io::{self, Read, Write};
13
14const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
17
18#[derive(Debug, Clone)]
20enum Container {
21 Array(Vec<u16>),
23 Bitmap(Box<[u64; 1024]>),
25 Runs(Vec<(u16, u16)>),
27}
28
29impl Container {
30 fn new_array() -> Self {
31 Container::Array(Vec::new())
32 }
33
34 #[allow(dead_code)]
35 fn new_bitmap() -> Self {
36 Container::Bitmap(Box::new([0u64; 1024]))
37 }
38
39 fn cardinality(&self) -> u32 {
40 match self {
41 Container::Array(arr) => arr.len() as u32,
42 Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
43 Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
44 }
45 }
46
47 #[inline]
49 fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
50 #[cfg(target_arch = "aarch64")]
51 {
52 use std::arch::aarch64::*;
53
54 let mut total = 0u32;
55 let bytes = bm.as_ptr() as *const u8;
56
57 unsafe {
59 for i in (0..8192).step_by(16) {
60 let v = vld1q_u8(bytes.add(i));
61 let cnt = vcntq_u8(v);
62 total += vaddlvq_u8(cnt) as u32;
63 }
64 }
65
66 total
67 }
68
69 #[cfg(target_arch = "x86_64")]
70 {
71 let mut total = 0u32;
73 for &word in bm.iter() {
74 total += word.count_ones();
75 }
76 total
77 }
78
79 #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
80 {
81 bm.iter().map(|w| w.count_ones()).sum()
82 }
83 }
84
85 fn contains(&self, val: u16) -> bool {
86 match self {
87 Container::Array(arr) => arr.binary_search(&val).is_ok(),
88 Container::Bitmap(bm) => {
89 let word_idx = (val / 64) as usize;
90 let bit_idx = val % 64;
91 (bm[word_idx] >> bit_idx) & 1 == 1
92 }
93 Container::Runs(runs) => {
94 for &(start, len) in runs {
95 if val >= start && val <= start + len {
96 return true;
97 }
98 if val < start {
99 return false;
100 }
101 }
102 false
103 }
104 }
105 }
106
107 fn insert(&mut self, val: u16) -> bool {
108 match self {
109 Container::Array(arr) => {
110 match arr.binary_search(&val) {
111 Ok(_) => false, Err(pos) => {
113 arr.insert(pos, val);
114 true
115 }
116 }
117 }
118 Container::Bitmap(bm) => {
119 let word_idx = (val / 64) as usize;
120 let bit_idx = val % 64;
121 let old = bm[word_idx];
122 bm[word_idx] |= 1u64 << bit_idx;
123 old != bm[word_idx]
124 }
125 Container::Runs(runs) => {
126 let mut arr: Vec<u16> = Vec::new();
128 for &(start, len) in runs.iter() {
129 for i in 0..=len {
130 arr.push(start + i);
131 }
132 }
133 let inserted = match arr.binary_search(&val) {
134 Ok(_) => false,
135 Err(pos) => {
136 arr.insert(pos, val);
137 true
138 }
139 };
140 *self = Container::Array(arr);
141 inserted
142 }
143 }
144 }
145
146 fn optimize(&mut self) {
147 let card = self.cardinality() as usize;
148
149 match self {
150 Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
151 let mut bm = Box::new([0u64; 1024]);
153 for &val in arr.iter() {
154 let word_idx = (val / 64) as usize;
155 let bit_idx = val % 64;
156 bm[word_idx] |= 1u64 << bit_idx;
157 }
158 *self = Container::Bitmap(bm);
159 }
160 Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
161 let mut arr = Vec::with_capacity(card);
163 for (word_idx, &word) in bm.iter().enumerate() {
164 let mut w = word;
165 while w != 0 {
166 let bit_idx = w.trailing_zeros();
167 arr.push((word_idx * 64 + bit_idx as usize) as u16);
168 w &= w - 1;
169 }
170 }
171 *self = Container::Array(arr);
172 }
173 _ => {}
174 }
175
176 self.try_run_encode();
178 }
179
180 fn try_run_encode(&mut self) {
181 let arr = match self {
182 Container::Array(arr) if arr.len() >= 4 => arr,
183 _ => return,
184 };
185
186 let mut runs = Vec::new();
187 let mut i = 0;
188
189 while i < arr.len() {
190 let start = arr[i];
191 let mut end = start;
192
193 while i + 1 < arr.len() && arr[i + 1] == end + 1 {
194 end = arr[i + 1];
195 i += 1;
196 }
197
198 runs.push((start, end - start));
199 i += 1;
200 }
201
202 if runs.len() * 4 < arr.len() * 2 {
206 *self = Container::Runs(runs);
207 }
208 }
209
210 fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
211 match self {
212 Container::Array(arr) => {
213 writer.write_u8(0)?; writer.write_u16::<LittleEndian>(arr.len() as u16)?;
215 for &val in arr {
216 writer.write_u16::<LittleEndian>(val)?;
217 }
218 }
219 Container::Bitmap(bm) => {
220 writer.write_u8(1)?; for &word in bm.iter() {
222 writer.write_u64::<LittleEndian>(word)?;
223 }
224 }
225 Container::Runs(runs) => {
226 writer.write_u8(2)?; writer.write_u16::<LittleEndian>(runs.len() as u16)?;
228 for &(start, len) in runs {
229 writer.write_u16::<LittleEndian>(start)?;
230 writer.write_u16::<LittleEndian>(len)?;
231 }
232 }
233 }
234 Ok(())
235 }
236
237 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
238 let tag = reader.read_u8()?;
239 match tag {
240 0 => {
241 let len = reader.read_u16::<LittleEndian>()? as usize;
242 let mut arr = Vec::with_capacity(len);
243 for _ in 0..len {
244 arr.push(reader.read_u16::<LittleEndian>()?);
245 }
246 Ok(Container::Array(arr))
247 }
248 1 => {
249 let mut bm = Box::new([0u64; 1024]);
250 for word in bm.iter_mut() {
251 *word = reader.read_u64::<LittleEndian>()?;
252 }
253 Ok(Container::Bitmap(bm))
254 }
255 2 => {
256 let len = reader.read_u16::<LittleEndian>()? as usize;
257 let mut runs = Vec::with_capacity(len);
258 for _ in 0..len {
259 let start = reader.read_u16::<LittleEndian>()?;
260 let run_len = reader.read_u16::<LittleEndian>()?;
261 runs.push((start, run_len));
262 }
263 Ok(Container::Runs(runs))
264 }
265 _ => Err(io::Error::new(
266 io::ErrorKind::InvalidData,
267 "Invalid container type",
268 )),
269 }
270 }
271
272 fn size_bytes(&self) -> usize {
273 match self {
274 Container::Array(arr) => arr.len() * 2 + 4,
275 Container::Bitmap(_) => 8 * 1024 + 1,
276 Container::Runs(runs) => runs.len() * 4 + 4,
277 }
278 }
279}
280
281#[derive(Debug, Clone)]
283pub struct RoaringBitmap {
284 containers: Vec<(u16, Container)>,
286}
287
288impl RoaringBitmap {
289 pub fn new() -> Self {
291 Self {
292 containers: Vec::new(),
293 }
294 }
295
296 pub fn from_sorted_slice(values: &[u32]) -> Self {
298 let mut bitmap = Self::new();
299
300 if values.is_empty() {
301 return bitmap;
302 }
303
304 let mut current_high = (values[0] >> 16) as u16;
305 let mut current_container = Container::new_array();
306
307 for &val in values {
308 let high = (val >> 16) as u16;
309 let low = val as u16;
310
311 if high != current_high {
312 current_container.optimize();
313 bitmap.containers.push((current_high, current_container));
314 current_high = high;
315 current_container = Container::new_array();
316 }
317
318 current_container.insert(low);
319 }
320
321 current_container.optimize();
322 bitmap.containers.push((current_high, current_container));
323
324 bitmap
325 }
326
327 pub fn insert(&mut self, val: u32) -> bool {
329 let high = (val >> 16) as u16;
330 let low = val as u16;
331
332 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
334 Ok(idx) => self.containers[idx].1.insert(low),
335 Err(idx) => {
336 let mut container = Container::new_array();
337 container.insert(low);
338 self.containers.insert(idx, (high, container));
339 true
340 }
341 }
342 }
343
344 pub fn contains(&self, val: u32) -> bool {
346 let high = (val >> 16) as u16;
347 let low = val as u16;
348
349 match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
350 Ok(idx) => self.containers[idx].1.contains(low),
351 Err(_) => false,
352 }
353 }
354
355 pub fn cardinality(&self) -> u32 {
357 self.containers.iter().map(|(_, c)| c.cardinality()).sum()
358 }
359
360 pub fn is_empty(&self) -> bool {
362 self.containers.is_empty()
363 }
364
365 pub fn optimize(&mut self) {
367 for (_, container) in &mut self.containers {
368 container.optimize();
369 }
370 }
371
372 pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
374 let mut result = RoaringBitmap::new();
375
376 let mut i = 0;
377 let mut j = 0;
378
379 while i < self.containers.len() && j < other.containers.len() {
380 let (high1, c1) = &self.containers[i];
381 let (high2, c2) = &other.containers[j];
382
383 match high1.cmp(high2) {
384 std::cmp::Ordering::Less => i += 1,
385 std::cmp::Ordering::Greater => j += 1,
386 std::cmp::Ordering::Equal => {
387 let intersected = Self::intersect_containers(c1, c2);
388 if intersected.cardinality() > 0 {
389 result.containers.push((*high1, intersected));
390 }
391 i += 1;
392 j += 1;
393 }
394 }
395 }
396
397 result
398 }
399
400 fn intersect_containers(c1: &Container, c2: &Container) -> Container {
401 match (c1, c2) {
402 (Container::Array(a1), Container::Array(a2)) => {
403 let mut result = Vec::new();
404 let mut i = 0;
405 let mut j = 0;
406 while i < a1.len() && j < a2.len() {
407 match a1[i].cmp(&a2[j]) {
408 std::cmp::Ordering::Less => i += 1,
409 std::cmp::Ordering::Greater => j += 1,
410 std::cmp::Ordering::Equal => {
411 result.push(a1[i]);
412 i += 1;
413 j += 1;
414 }
415 }
416 }
417 Container::Array(result)
418 }
419 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
420 let mut result = Box::new([0u64; 1024]);
421 for i in 0..1024 {
422 result[i] = b1[i] & b2[i];
423 }
424 let mut c = Container::Bitmap(result);
425 c.optimize();
426 c
427 }
428 (Container::Array(arr), Container::Bitmap(bm))
429 | (Container::Bitmap(bm), Container::Array(arr)) => {
430 let mut result = Vec::new();
431 for &val in arr {
432 let word_idx = (val / 64) as usize;
433 let bit_idx = val % 64;
434 if (bm[word_idx] >> bit_idx) & 1 == 1 {
435 result.push(val);
436 }
437 }
438 Container::Array(result)
439 }
440 _ => {
441 let arr1 = Self::container_to_array(c1);
443 let arr2 = Self::container_to_array(c2);
444 Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
445 }
446 }
447 }
448
449 fn container_to_array(c: &Container) -> Vec<u16> {
450 match c {
451 Container::Array(arr) => arr.clone(),
452 Container::Bitmap(bm) => {
453 let mut arr = Vec::new();
454 for (word_idx, &word) in bm.iter().enumerate() {
455 let mut w = word;
456 while w != 0 {
457 let bit_idx = w.trailing_zeros();
458 arr.push((word_idx * 64 + bit_idx as usize) as u16);
459 w &= w - 1;
460 }
461 }
462 arr
463 }
464 Container::Runs(runs) => {
465 let mut arr = Vec::new();
466 for &(start, len) in runs {
467 for i in 0..=len {
468 arr.push(start + i);
469 }
470 }
471 arr
472 }
473 }
474 }
475
476 pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
478 let mut result = RoaringBitmap::new();
479
480 let mut i = 0;
481 let mut j = 0;
482
483 while i < self.containers.len() || j < other.containers.len() {
484 if i >= self.containers.len() {
485 result.containers.push(other.containers[j].clone());
486 j += 1;
487 } else if j >= other.containers.len() {
488 result.containers.push(self.containers[i].clone());
489 i += 1;
490 } else {
491 let (high1, c1) = &self.containers[i];
492 let (high2, c2) = &other.containers[j];
493
494 match high1.cmp(high2) {
495 std::cmp::Ordering::Less => {
496 result.containers.push(self.containers[i].clone());
497 i += 1;
498 }
499 std::cmp::Ordering::Greater => {
500 result.containers.push(other.containers[j].clone());
501 j += 1;
502 }
503 std::cmp::Ordering::Equal => {
504 let united = Self::union_containers(c1, c2);
505 result.containers.push((*high1, united));
506 i += 1;
507 j += 1;
508 }
509 }
510 }
511 }
512
513 result
514 }
515
516 fn union_containers(c1: &Container, c2: &Container) -> Container {
517 match (c1, c2) {
518 (Container::Bitmap(b1), Container::Bitmap(b2)) => {
519 let mut result = Box::new([0u64; 1024]);
520 for i in 0..1024 {
521 result[i] = b1[i] | b2[i];
522 }
523 Container::Bitmap(result)
524 }
525 _ => {
526 let arr1 = Self::container_to_array(c1);
528 let arr2 = Self::container_to_array(c2);
529
530 let mut result = Vec::with_capacity(arr1.len() + arr2.len());
531 let mut i = 0;
532 let mut j = 0;
533
534 while i < arr1.len() && j < arr2.len() {
535 match arr1[i].cmp(&arr2[j]) {
536 std::cmp::Ordering::Less => {
537 result.push(arr1[i]);
538 i += 1;
539 }
540 std::cmp::Ordering::Greater => {
541 result.push(arr2[j]);
542 j += 1;
543 }
544 std::cmp::Ordering::Equal => {
545 result.push(arr1[i]);
546 i += 1;
547 j += 1;
548 }
549 }
550 }
551
552 result.extend_from_slice(&arr1[i..]);
553 result.extend_from_slice(&arr2[j..]);
554
555 let mut c = Container::Array(result);
556 c.optimize();
557 c
558 }
559 }
560 }
561
562 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
564 writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
565 for (high, container) in &self.containers {
566 writer.write_u16::<LittleEndian>(*high)?;
567 container.serialize(writer)?;
568 }
569 Ok(())
570 }
571
572 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
574 let num_containers = reader.read_u32::<LittleEndian>()? as usize;
575 let mut containers = Vec::with_capacity(num_containers);
576
577 for _ in 0..num_containers {
578 let high = reader.read_u16::<LittleEndian>()?;
579 let container = Container::deserialize(reader)?;
580 containers.push((high, container));
581 }
582
583 Ok(Self { containers })
584 }
585
586 pub fn size_bytes(&self) -> usize {
588 4 + self
589 .containers
590 .iter()
591 .map(|(_, c)| 2 + c.size_bytes())
592 .sum::<usize>()
593 }
594
595 pub fn iter(&self) -> RoaringIterator<'_> {
597 RoaringIterator {
599 bitmap: self,
600 container_idx: 0,
601 value_idx: 0,
602 current_len: 0,
603 current_values: Vec::with_capacity(ARRAY_TO_BITMAP_THRESHOLD),
604 }
605 }
606}
607
608impl Default for RoaringBitmap {
609 fn default() -> Self {
610 Self::new()
611 }
612}
613
614pub struct RoaringIterator<'a> {
616 bitmap: &'a RoaringBitmap,
617 container_idx: usize,
618 value_idx: usize,
619 current_len: usize,
621 current_values: Vec<u16>,
623}
624
625impl<'a> Iterator for RoaringIterator<'a> {
626 type Item = u32;
627
628 fn next(&mut self) -> Option<Self::Item> {
629 loop {
630 if self.value_idx < self.current_len {
631 let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
632 let low = self.current_values[self.value_idx] as u32;
633 self.value_idx += 1;
634 return Some((high << 16) | low);
635 }
636
637 if self.container_idx >= self.bitmap.containers.len() {
638 return None;
639 }
640
641 let (_, container) = &self.bitmap.containers[self.container_idx];
642 self.current_len = Self::decode_container_into(container, &mut self.current_values);
644 self.value_idx = 0;
645 self.container_idx += 1;
646 }
647 }
648}
649
650impl<'a> RoaringIterator<'a> {
651 #[inline]
654 fn decode_container_into(container: &Container, output: &mut Vec<u16>) -> usize {
655 match container {
656 Container::Array(arr) => {
657 let len = arr.len();
658 if output.len() < len {
659 output.resize(len, 0);
660 }
661 output[..len].copy_from_slice(arr);
662 len
663 }
664 Container::Bitmap(bm) => Self::decode_bitmap_into(bm, output),
665 Container::Runs(runs) => {
666 let mut count = 0;
667 for &(start, len) in runs {
668 for i in 0..=len {
669 let val = start + i;
670 if count >= output.len() {
671 output.push(val);
672 } else {
673 output[count] = val;
674 }
675 count += 1;
676 }
677 }
678 count
679 }
680 }
681 }
682
683 #[inline]
686 fn decode_bitmap_into(bm: &[u64; 1024], output: &mut Vec<u16>) -> usize {
687 let mut count = 0;
688
689 for (word_idx, &word) in bm.iter().enumerate() {
690 if word == 0 {
691 continue;
692 }
693
694 let base = (word_idx * 64) as u16;
695 let mut w = word;
696
697 while w != 0 {
699 let bit_idx = w.trailing_zeros() as u16;
700 let val = base + bit_idx;
701
702 if count >= output.len() {
703 output.push(val);
704 } else {
705 output[count] = val;
706 }
707 count += 1;
708 w &= w - 1; }
710 }
711
712 count
713 }
714}
715
716pub const ROARING_BLOCK_SIZE: usize = 65536;
718
719#[derive(Debug, Clone)]
721pub struct RoaringBlockInfo {
722 pub container_key: u16,
724 pub first_doc_id: u32,
726 pub last_doc_id: u32,
728 pub max_tf: u32,
730 pub max_block_score: f32,
732 pub num_docs: u32,
734}
735
736impl RoaringBlockInfo {
737 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
739 writer.write_u16::<LittleEndian>(self.container_key)?;
740 writer.write_u32::<LittleEndian>(self.first_doc_id)?;
741 writer.write_u32::<LittleEndian>(self.last_doc_id)?;
742 writer.write_u32::<LittleEndian>(self.max_tf)?;
743 writer.write_f32::<LittleEndian>(self.max_block_score)?;
744 writer.write_u32::<LittleEndian>(self.num_docs)?;
745 Ok(())
746 }
747
748 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
750 Ok(Self {
751 container_key: reader.read_u16::<LittleEndian>()?,
752 first_doc_id: reader.read_u32::<LittleEndian>()?,
753 last_doc_id: reader.read_u32::<LittleEndian>()?,
754 max_tf: reader.read_u32::<LittleEndian>()?,
755 max_block_score: reader.read_f32::<LittleEndian>()?,
756 num_docs: reader.read_u32::<LittleEndian>()?,
757 })
758 }
759}
760
761#[derive(Debug, Clone)]
763pub struct RoaringPostingList {
764 pub doc_ids: RoaringBitmap,
766 pub term_freqs: Vec<(u32, u32)>,
769 pub default_tf: u32,
771 pub max_tf: u32,
773 pub blocks: Vec<RoaringBlockInfo>,
775 pub max_score: f32,
777}
778
779impl RoaringPostingList {
780 pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
782 Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
783 }
784
785 pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
787 assert_eq!(doc_ids.len(), term_freqs.len());
788
789 if doc_ids.is_empty() {
790 return Self {
791 doc_ids: RoaringBitmap::new(),
792 term_freqs: Vec::new(),
793 default_tf: 1,
794 max_tf: 0,
795 blocks: Vec::new(),
796 max_score: 0.0,
797 };
798 }
799
800 let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
801
802 let mut tf_counts = std::collections::HashMap::new();
804 for &tf in term_freqs {
805 *tf_counts.entry(tf).or_insert(0u32) += 1;
806 }
807 let default_tf = tf_counts
808 .iter()
809 .max_by_key(|&(_, count)| count)
810 .map(|(&tf, _)| tf)
811 .unwrap_or(1);
812
813 let exceptions: Vec<(u32, u32)> = doc_ids
815 .iter()
816 .zip(term_freqs.iter())
817 .filter(|&(_, &tf)| tf != default_tf)
818 .map(|(&doc, &tf)| (doc, tf))
819 .collect();
820
821 let max_tf = *term_freqs.iter().max().unwrap_or(&1);
822
823 let mut blocks = Vec::new();
826 let mut max_score = 0.0f32;
827 let mut i = 0;
828
829 while i < doc_ids.len() {
830 let container_key = (doc_ids[i] >> 16) as u16;
831 let block_start = i;
832
833 while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
835 i += 1;
836 }
837
838 let block_doc_ids = &doc_ids[block_start..i];
839 let block_tfs = &term_freqs[block_start..i];
840 let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
841 let block_score = crate::query::bm25_upper_bound(block_max_tf as f32, idf);
842 max_score = max_score.max(block_score);
843
844 blocks.push(RoaringBlockInfo {
845 container_key,
846 first_doc_id: block_doc_ids[0],
847 last_doc_id: *block_doc_ids.last().unwrap(),
848 max_tf: block_max_tf,
849 max_block_score: block_score,
850 num_docs: block_doc_ids.len() as u32,
851 });
852 }
853
854 Self {
855 doc_ids: bitmap,
856 term_freqs: exceptions,
857 default_tf,
858 max_tf,
859 blocks,
860 max_score,
861 }
862 }
863
864 pub fn get_tf(&self, doc_id: u32) -> u32 {
866 match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
868 Ok(idx) => self.term_freqs[idx].1,
869 Err(_) => self.default_tf,
870 }
871 }
872
873 pub fn len(&self) -> u32 {
875 self.doc_ids.cardinality()
876 }
877
878 pub fn is_empty(&self) -> bool {
880 self.doc_ids.is_empty()
881 }
882
883 pub fn num_blocks(&self) -> usize {
885 self.blocks.len()
886 }
887
888 pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
890 let container_key = (doc_id >> 16) as u16;
891 self.blocks
892 .binary_search_by_key(&container_key, |b| b.container_key)
893 .ok()
894 }
895
896 pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
898 self.doc_ids.serialize(writer)?;
899 writer.write_u32::<LittleEndian>(self.default_tf)?;
900 writer.write_u32::<LittleEndian>(self.max_tf)?;
901 writer.write_f32::<LittleEndian>(self.max_score)?;
902 writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
903 for &(doc, tf) in &self.term_freqs {
904 writer.write_u32::<LittleEndian>(doc)?;
905 writer.write_u32::<LittleEndian>(tf)?;
906 }
907
908 writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
910 for block in &self.blocks {
911 block.serialize(writer)?;
912 }
913
914 Ok(())
915 }
916
917 pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
919 let doc_ids = RoaringBitmap::deserialize(reader)?;
920 let default_tf = reader.read_u32::<LittleEndian>()?;
921 let max_tf = reader.read_u32::<LittleEndian>()?;
922 let max_score = reader.read_f32::<LittleEndian>()?;
923 let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
924 let mut term_freqs = Vec::with_capacity(num_exceptions);
925 for _ in 0..num_exceptions {
926 let doc = reader.read_u32::<LittleEndian>()?;
927 let tf = reader.read_u32::<LittleEndian>()?;
928 term_freqs.push((doc, tf));
929 }
930
931 let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
933 let mut blocks = Vec::with_capacity(num_blocks);
934 for _ in 0..num_blocks {
935 blocks.push(RoaringBlockInfo::deserialize(reader)?);
936 }
937
938 Ok(Self {
939 doc_ids,
940 term_freqs,
941 default_tf,
942 max_tf,
943 blocks,
944 max_score,
945 })
946 }
947
948 pub fn iterator(&self) -> RoaringPostingIterator<'_> {
950 RoaringPostingIterator {
951 list: self,
952 doc_iter: self.doc_ids.iter(),
953 current_doc: None,
954 current_block: 0,
955 }
956 }
957}
958
959pub struct RoaringPostingIterator<'a> {
961 list: &'a RoaringPostingList,
962 doc_iter: RoaringIterator<'a>,
963 current_doc: Option<u32>,
964 current_block: usize,
965}
966
967impl<'a> RoaringPostingIterator<'a> {
968 pub fn doc(&self) -> u32 {
970 self.current_doc.unwrap_or(u32::MAX)
971 }
972
973 pub fn term_freq(&self) -> u32 {
975 match self.current_doc {
976 Some(doc) => self.list.get_tf(doc),
977 None => 0,
978 }
979 }
980
981 pub fn advance(&mut self) -> u32 {
983 self.current_doc = self.doc_iter.next();
984 if let Some(doc) = self.current_doc
986 && !self.list.blocks.is_empty()
987 {
988 let container_key = (doc >> 16) as u16;
989 while self.current_block < self.list.blocks.len()
991 && self.list.blocks[self.current_block].container_key < container_key
992 {
993 self.current_block += 1;
994 }
995 }
996 self.doc()
997 }
998
999 pub fn init(&mut self) {
1001 self.current_doc = self.doc_iter.next();
1002 self.current_block = 0;
1003 }
1004
1005 pub fn seek(&mut self, target: u32) -> u32 {
1007 if !self.list.blocks.is_empty() {
1009 let target_container = (target >> 16) as u16;
1010
1011 while self.current_block < self.list.blocks.len()
1013 && self.list.blocks[self.current_block].last_doc_id < target
1014 {
1015 self.current_block += 1;
1016 }
1017
1018 if self.current_block >= self.list.blocks.len() {
1020 self.current_doc = None;
1021 return u32::MAX;
1022 }
1023
1024 let block = &self.list.blocks[self.current_block];
1026 if block.container_key > target_container
1027 || (block.container_key == target_container && block.first_doc_id > self.doc())
1028 {
1029 while let Some(doc) = self.current_doc {
1031 if doc >= block.first_doc_id {
1032 break;
1033 }
1034 self.current_doc = self.doc_iter.next();
1035 }
1036 }
1037 }
1038
1039 while let Some(doc) = self.current_doc {
1041 if doc >= target {
1042 return doc;
1043 }
1044 self.current_doc = self.doc_iter.next();
1045 }
1046 u32::MAX
1047 }
1048
1049 pub fn is_exhausted(&self) -> bool {
1051 self.current_doc.is_none()
1052 }
1053
1054 pub fn current_block_max_score(&self) -> f32 {
1056 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1057 return 0.0;
1058 }
1059 if self.current_block < self.list.blocks.len() {
1060 self.list.blocks[self.current_block].max_block_score
1061 } else {
1062 0.0
1063 }
1064 }
1065
1066 pub fn current_block_max_tf(&self) -> u32 {
1068 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1069 return 0;
1070 }
1071 if self.current_block < self.list.blocks.len() {
1072 self.list.blocks[self.current_block].max_tf
1073 } else {
1074 0
1075 }
1076 }
1077
1078 pub fn max_remaining_score(&self) -> f32 {
1080 if self.current_doc.is_none() || self.list.blocks.is_empty() {
1081 return 0.0;
1082 }
1083 self.list.blocks[self.current_block..]
1084 .iter()
1085 .map(|b| b.max_block_score)
1086 .fold(0.0f32, |a, b| a.max(b))
1087 }
1088
1089 pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
1092 if self.list.blocks.is_empty() {
1093 return None;
1094 }
1095
1096 while self.current_block < self.list.blocks.len() {
1097 let block = &self.list.blocks[self.current_block];
1098 if block.last_doc_id >= target {
1099 while let Some(doc) = self.current_doc {
1101 if doc >= block.first_doc_id {
1102 break;
1103 }
1104 self.current_doc = self.doc_iter.next();
1105 }
1106 return Some((block.first_doc_id, block.max_block_score));
1107 }
1108 self.current_block += 1;
1109 }
1110
1111 self.current_doc = None;
1112 None
1113 }
1114}
1115
1116#[cfg(test)]
1117mod tests {
1118 use super::*;
1119
1120 #[test]
1121 fn test_roaring_basic() {
1122 let mut bitmap = RoaringBitmap::new();
1123 bitmap.insert(1);
1124 bitmap.insert(100);
1125 bitmap.insert(1000);
1126 bitmap.insert(100000);
1127
1128 assert!(bitmap.contains(1));
1129 assert!(bitmap.contains(100));
1130 assert!(bitmap.contains(1000));
1131 assert!(bitmap.contains(100000));
1132 assert!(!bitmap.contains(2));
1133 assert!(!bitmap.contains(50000));
1134
1135 assert_eq!(bitmap.cardinality(), 4);
1136 }
1137
1138 #[test]
1139 fn test_roaring_from_sorted() {
1140 let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
1141 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1142
1143 assert_eq!(bitmap.cardinality(), 10000);
1144
1145 for &val in &values {
1146 assert!(bitmap.contains(val), "Missing value {}", val);
1147 }
1148 }
1149
1150 #[test]
1151 fn test_roaring_intersection() {
1152 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
1153 let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
1154
1155 let result = a.and(&b);
1156
1157 assert_eq!(result.cardinality(), 4);
1158 assert!(result.contains(2));
1159 assert!(result.contains(3));
1160 assert!(result.contains(200));
1161 assert!(result.contains(300));
1162 }
1163
1164 #[test]
1165 fn test_roaring_union() {
1166 let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
1167 let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
1168
1169 let result = a.or(&b);
1170
1171 assert_eq!(result.cardinality(), 5);
1172 for i in 1..=5 {
1173 assert!(result.contains(i));
1174 }
1175 }
1176
1177 #[test]
1178 fn test_roaring_serialization() {
1179 let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
1180 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1181
1182 let mut buffer = Vec::new();
1183 bitmap.serialize(&mut buffer).unwrap();
1184
1185 let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
1186
1187 assert_eq!(restored.cardinality(), bitmap.cardinality());
1188 for &val in &values {
1189 assert!(restored.contains(val));
1190 }
1191 }
1192
1193 #[test]
1194 fn test_roaring_posting_list() {
1195 let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
1196 let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
1197
1198 let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
1199
1200 assert_eq!(list.len(), 7);
1201 assert_eq!(list.default_tf, 1);
1202 assert_eq!(list.max_tf, 3);
1203
1204 assert_eq!(list.get_tf(1), 1);
1206 assert_eq!(list.get_tf(10), 2);
1207 assert_eq!(list.get_tf(100), 3);
1208 }
1209
1210 #[test]
1211 fn test_roaring_iterator() {
1212 let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
1213 let bitmap = RoaringBitmap::from_sorted_slice(&values);
1214
1215 let collected: Vec<u32> = bitmap.iter().collect();
1216 assert_eq!(collected, values);
1217 }
1218
1219 #[test]
1220 fn test_roaring_block_max() {
1221 let mut doc_ids = Vec::new();
1226 let mut term_freqs = Vec::new();
1227
1228 for i in 0..100 {
1230 doc_ids.push(i * 100);
1231 term_freqs.push(if i == 50 { 2 } else { 1 });
1232 }
1233
1234 for i in 0..100 {
1236 doc_ids.push(65536 + i * 100);
1237 term_freqs.push(if i == 25 { 5 } else { 1 });
1238 }
1239
1240 for i in 0..100 {
1242 doc_ids.push(131072 + i * 100);
1243 term_freqs.push(if i == 75 { 3 } else { 1 });
1244 }
1245
1246 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
1247
1248 assert_eq!(list.num_blocks(), 3);
1250 assert_eq!(list.blocks[0].container_key, 0);
1251 assert_eq!(list.blocks[1].container_key, 1);
1252 assert_eq!(list.blocks[2].container_key, 2);
1253
1254 assert_eq!(list.blocks[0].max_tf, 2);
1255 assert_eq!(list.blocks[1].max_tf, 5);
1256 assert_eq!(list.blocks[2].max_tf, 3);
1257
1258 assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
1260 assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
1261
1262 assert_eq!(list.max_score, list.blocks[1].max_block_score);
1264
1265 let mut iter = list.iterator();
1267 iter.init();
1268 assert_eq!(iter.current_block_max_tf(), 2); iter.seek(65536);
1272 assert_eq!(iter.current_block_max_tf(), 5);
1273
1274 iter.seek(131072);
1276 assert_eq!(iter.current_block_max_tf(), 3);
1277 }
1278
1279 #[test]
1280 fn test_roaring_block_max_serialization() {
1281 let mut doc_ids = Vec::new();
1282 let mut term_freqs = Vec::new();
1283
1284 for i in 0..50 {
1286 doc_ids.push(i * 10);
1287 term_freqs.push((i % 5) + 1);
1288 }
1289 for i in 0..50 {
1290 doc_ids.push(65536 + i * 10);
1291 term_freqs.push((i % 3) + 1);
1292 }
1293
1294 let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
1295
1296 let mut buffer = Vec::new();
1297 list.serialize(&mut buffer).unwrap();
1298
1299 let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
1300
1301 assert_eq!(restored.len(), list.len());
1302 assert_eq!(restored.max_tf, list.max_tf);
1303 assert_eq!(restored.max_score, list.max_score);
1304 assert_eq!(restored.num_blocks(), list.num_blocks());
1305
1306 for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
1308 assert_eq!(orig.container_key, rest.container_key);
1309 assert_eq!(orig.first_doc_id, rest.first_doc_id);
1310 assert_eq!(orig.last_doc_id, rest.last_doc_id);
1311 assert_eq!(orig.max_tf, rest.max_tf);
1312 assert_eq!(orig.max_block_score, rest.max_block_score);
1313 }
1314
1315 let mut iter1 = list.iterator();
1317 let mut iter2 = restored.iterator();
1318 iter1.init();
1319 iter2.init();
1320
1321 while !iter1.is_exhausted() {
1322 assert_eq!(iter1.doc(), iter2.doc());
1323 assert_eq!(iter1.term_freq(), iter2.term_freq());
1324 iter1.advance();
1325 iter2.advance();
1326 }
1327 assert!(iter2.is_exhausted());
1328 }
1329}