use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::{self, Read, Write};
const ARRAY_TO_BITMAP_THRESHOLD: usize = 4096;
#[derive(Debug, Clone)]
enum Container {
Array(Vec<u16>),
Bitmap(Box<[u64; 1024]>),
Runs(Vec<(u16, u16)>),
}
impl Container {
fn new_array() -> Self {
Container::Array(Vec::new())
}
#[allow(dead_code)]
fn new_bitmap() -> Self {
Container::Bitmap(Box::new([0u64; 1024]))
}
fn cardinality(&self) -> u32 {
match self {
Container::Array(arr) => arr.len() as u32,
Container::Bitmap(bm) => Self::bitmap_cardinality(bm),
Container::Runs(runs) => runs.iter().map(|(_, len)| *len as u32 + 1).sum(),
}
}
#[inline]
fn bitmap_cardinality(bm: &[u64; 1024]) -> u32 {
#[cfg(target_arch = "aarch64")]
{
use std::arch::aarch64::*;
let mut total = 0u32;
let bytes = bm.as_ptr() as *const u8;
unsafe {
for i in (0..8192).step_by(16) {
let v = vld1q_u8(bytes.add(i));
let cnt = vcntq_u8(v);
total += vaddlvq_u8(cnt) as u32;
}
}
total
}
#[cfg(target_arch = "x86_64")]
{
let mut total = 0u32;
for &word in bm.iter() {
total += word.count_ones();
}
total
}
#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
{
bm.iter().map(|w| w.count_ones()).sum()
}
}
fn contains(&self, val: u16) -> bool {
match self {
Container::Array(arr) => arr.binary_search(&val).is_ok(),
Container::Bitmap(bm) => {
let word_idx = (val / 64) as usize;
let bit_idx = val % 64;
(bm[word_idx] >> bit_idx) & 1 == 1
}
Container::Runs(runs) => {
for &(start, len) in runs {
if val >= start && val <= start + len {
return true;
}
if val < start {
return false;
}
}
false
}
}
}
fn insert(&mut self, val: u16) -> bool {
match self {
Container::Array(arr) => {
match arr.binary_search(&val) {
Ok(_) => false, Err(pos) => {
arr.insert(pos, val);
true
}
}
}
Container::Bitmap(bm) => {
let word_idx = (val / 64) as usize;
let bit_idx = val % 64;
let old = bm[word_idx];
bm[word_idx] |= 1u64 << bit_idx;
old != bm[word_idx]
}
Container::Runs(runs) => {
let mut arr: Vec<u16> = Vec::new();
for &(start, len) in runs.iter() {
for i in 0..=len {
arr.push(start + i);
}
}
let inserted = match arr.binary_search(&val) {
Ok(_) => false,
Err(pos) => {
arr.insert(pos, val);
true
}
};
*self = Container::Array(arr);
inserted
}
}
}
fn optimize(&mut self) {
let card = self.cardinality() as usize;
match self {
Container::Array(arr) if card > ARRAY_TO_BITMAP_THRESHOLD => {
let mut bm = Box::new([0u64; 1024]);
for &val in arr.iter() {
let word_idx = (val / 64) as usize;
let bit_idx = val % 64;
bm[word_idx] |= 1u64 << bit_idx;
}
*self = Container::Bitmap(bm);
}
Container::Bitmap(bm) if card <= ARRAY_TO_BITMAP_THRESHOLD => {
let mut arr = Vec::with_capacity(card);
for (word_idx, &word) in bm.iter().enumerate() {
let mut w = word;
while w != 0 {
let bit_idx = w.trailing_zeros();
arr.push((word_idx * 64 + bit_idx as usize) as u16);
w &= w - 1;
}
}
*self = Container::Array(arr);
}
_ => {}
}
self.try_run_encode();
}
fn try_run_encode(&mut self) {
let arr = match self {
Container::Array(arr) if arr.len() >= 4 => arr,
_ => return,
};
let mut runs = Vec::new();
let mut i = 0;
while i < arr.len() {
let start = arr[i];
let mut end = start;
while i + 1 < arr.len() && arr[i + 1] == end + 1 {
end = arr[i + 1];
i += 1;
}
runs.push((start, end - start));
i += 1;
}
if runs.len() * 4 < arr.len() * 2 {
*self = Container::Runs(runs);
}
}
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
match self {
Container::Array(arr) => {
writer.write_u8(0)?; writer.write_u16::<LittleEndian>(arr.len() as u16)?;
for &val in arr {
writer.write_u16::<LittleEndian>(val)?;
}
}
Container::Bitmap(bm) => {
writer.write_u8(1)?; for &word in bm.iter() {
writer.write_u64::<LittleEndian>(word)?;
}
}
Container::Runs(runs) => {
writer.write_u8(2)?; writer.write_u16::<LittleEndian>(runs.len() as u16)?;
for &(start, len) in runs {
writer.write_u16::<LittleEndian>(start)?;
writer.write_u16::<LittleEndian>(len)?;
}
}
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let tag = reader.read_u8()?;
match tag {
0 => {
let len = reader.read_u16::<LittleEndian>()? as usize;
let mut arr = Vec::with_capacity(len);
for _ in 0..len {
arr.push(reader.read_u16::<LittleEndian>()?);
}
Ok(Container::Array(arr))
}
1 => {
let mut bm = Box::new([0u64; 1024]);
for word in bm.iter_mut() {
*word = reader.read_u64::<LittleEndian>()?;
}
Ok(Container::Bitmap(bm))
}
2 => {
let len = reader.read_u16::<LittleEndian>()? as usize;
let mut runs = Vec::with_capacity(len);
for _ in 0..len {
let start = reader.read_u16::<LittleEndian>()?;
let run_len = reader.read_u16::<LittleEndian>()?;
runs.push((start, run_len));
}
Ok(Container::Runs(runs))
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
"Invalid container type",
)),
}
}
fn size_bytes(&self) -> usize {
match self {
Container::Array(arr) => arr.len() * 2 + 4,
Container::Bitmap(_) => 8 * 1024 + 1,
Container::Runs(runs) => runs.len() * 4 + 4,
}
}
}
#[derive(Debug, Clone)]
pub struct RoaringBitmap {
containers: Vec<(u16, Container)>,
}
impl RoaringBitmap {
pub fn new() -> Self {
Self {
containers: Vec::new(),
}
}
pub fn from_sorted_slice(values: &[u32]) -> Self {
let mut bitmap = Self::new();
if values.is_empty() {
return bitmap;
}
let mut current_high = (values[0] >> 16) as u16;
let mut current_container = Container::new_array();
for &val in values {
let high = (val >> 16) as u16;
let low = val as u16;
if high != current_high {
current_container.optimize();
bitmap.containers.push((current_high, current_container));
current_high = high;
current_container = Container::new_array();
}
current_container.insert(low);
}
current_container.optimize();
bitmap.containers.push((current_high, current_container));
bitmap
}
pub fn insert(&mut self, val: u32) -> bool {
let high = (val >> 16) as u16;
let low = val as u16;
match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
Ok(idx) => self.containers[idx].1.insert(low),
Err(idx) => {
let mut container = Container::new_array();
container.insert(low);
self.containers.insert(idx, (high, container));
true
}
}
}
pub fn contains(&self, val: u32) -> bool {
let high = (val >> 16) as u16;
let low = val as u16;
match self.containers.binary_search_by_key(&high, |&(h, _)| h) {
Ok(idx) => self.containers[idx].1.contains(low),
Err(_) => false,
}
}
pub fn cardinality(&self) -> u32 {
self.containers.iter().map(|(_, c)| c.cardinality()).sum()
}
pub fn is_empty(&self) -> bool {
self.containers.is_empty()
}
pub fn optimize(&mut self) {
for (_, container) in &mut self.containers {
container.optimize();
}
}
pub fn and(&self, other: &RoaringBitmap) -> RoaringBitmap {
let mut result = RoaringBitmap::new();
let mut i = 0;
let mut j = 0;
while i < self.containers.len() && j < other.containers.len() {
let (high1, c1) = &self.containers[i];
let (high2, c2) = &other.containers[j];
match high1.cmp(high2) {
std::cmp::Ordering::Less => i += 1,
std::cmp::Ordering::Greater => j += 1,
std::cmp::Ordering::Equal => {
let intersected = Self::intersect_containers(c1, c2);
if intersected.cardinality() > 0 {
result.containers.push((*high1, intersected));
}
i += 1;
j += 1;
}
}
}
result
}
fn intersect_containers(c1: &Container, c2: &Container) -> Container {
match (c1, c2) {
(Container::Array(a1), Container::Array(a2)) => {
let mut result = Vec::new();
let mut i = 0;
let mut j = 0;
while i < a1.len() && j < a2.len() {
match a1[i].cmp(&a2[j]) {
std::cmp::Ordering::Less => i += 1,
std::cmp::Ordering::Greater => j += 1,
std::cmp::Ordering::Equal => {
result.push(a1[i]);
i += 1;
j += 1;
}
}
}
Container::Array(result)
}
(Container::Bitmap(b1), Container::Bitmap(b2)) => {
let mut result = Box::new([0u64; 1024]);
for i in 0..1024 {
result[i] = b1[i] & b2[i];
}
let mut c = Container::Bitmap(result);
c.optimize();
c
}
(Container::Array(arr), Container::Bitmap(bm))
| (Container::Bitmap(bm), Container::Array(arr)) => {
let mut result = Vec::new();
for &val in arr {
let word_idx = (val / 64) as usize;
let bit_idx = val % 64;
if (bm[word_idx] >> bit_idx) & 1 == 1 {
result.push(val);
}
}
Container::Array(result)
}
_ => {
let arr1 = Self::container_to_array(c1);
let arr2 = Self::container_to_array(c2);
Self::intersect_containers(&Container::Array(arr1), &Container::Array(arr2))
}
}
}
fn container_to_array(c: &Container) -> Vec<u16> {
match c {
Container::Array(arr) => arr.clone(),
Container::Bitmap(bm) => {
let mut arr = Vec::new();
for (word_idx, &word) in bm.iter().enumerate() {
let mut w = word;
while w != 0 {
let bit_idx = w.trailing_zeros();
arr.push((word_idx * 64 + bit_idx as usize) as u16);
w &= w - 1;
}
}
arr
}
Container::Runs(runs) => {
let mut arr = Vec::new();
for &(start, len) in runs {
for i in 0..=len {
arr.push(start + i);
}
}
arr
}
}
}
pub fn or(&self, other: &RoaringBitmap) -> RoaringBitmap {
let mut result = RoaringBitmap::new();
let mut i = 0;
let mut j = 0;
while i < self.containers.len() || j < other.containers.len() {
if i >= self.containers.len() {
result.containers.push(other.containers[j].clone());
j += 1;
} else if j >= other.containers.len() {
result.containers.push(self.containers[i].clone());
i += 1;
} else {
let (high1, c1) = &self.containers[i];
let (high2, c2) = &other.containers[j];
match high1.cmp(high2) {
std::cmp::Ordering::Less => {
result.containers.push(self.containers[i].clone());
i += 1;
}
std::cmp::Ordering::Greater => {
result.containers.push(other.containers[j].clone());
j += 1;
}
std::cmp::Ordering::Equal => {
let united = Self::union_containers(c1, c2);
result.containers.push((*high1, united));
i += 1;
j += 1;
}
}
}
}
result
}
fn union_containers(c1: &Container, c2: &Container) -> Container {
match (c1, c2) {
(Container::Bitmap(b1), Container::Bitmap(b2)) => {
let mut result = Box::new([0u64; 1024]);
for i in 0..1024 {
result[i] = b1[i] | b2[i];
}
Container::Bitmap(result)
}
_ => {
let arr1 = Self::container_to_array(c1);
let arr2 = Self::container_to_array(c2);
let mut result = Vec::with_capacity(arr1.len() + arr2.len());
let mut i = 0;
let mut j = 0;
while i < arr1.len() && j < arr2.len() {
match arr1[i].cmp(&arr2[j]) {
std::cmp::Ordering::Less => {
result.push(arr1[i]);
i += 1;
}
std::cmp::Ordering::Greater => {
result.push(arr2[j]);
j += 1;
}
std::cmp::Ordering::Equal => {
result.push(arr1[i]);
i += 1;
j += 1;
}
}
}
result.extend_from_slice(&arr1[i..]);
result.extend_from_slice(&arr2[j..]);
let mut c = Container::Array(result);
c.optimize();
c
}
}
}
pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u32::<LittleEndian>(self.containers.len() as u32)?;
for (high, container) in &self.containers {
writer.write_u16::<LittleEndian>(*high)?;
container.serialize(writer)?;
}
Ok(())
}
pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let num_containers = reader.read_u32::<LittleEndian>()? as usize;
let mut containers = Vec::with_capacity(num_containers);
for _ in 0..num_containers {
let high = reader.read_u16::<LittleEndian>()?;
let container = Container::deserialize(reader)?;
containers.push((high, container));
}
Ok(Self { containers })
}
pub fn size_bytes(&self) -> usize {
4 + self
.containers
.iter()
.map(|(_, c)| 2 + c.size_bytes())
.sum::<usize>()
}
pub fn iter(&self) -> RoaringIterator<'_> {
RoaringIterator {
bitmap: self,
container_idx: 0,
value_idx: 0,
current_len: 0,
current_values: Vec::with_capacity(ARRAY_TO_BITMAP_THRESHOLD),
}
}
}
impl Default for RoaringBitmap {
fn default() -> Self {
Self::new()
}
}
pub struct RoaringIterator<'a> {
bitmap: &'a RoaringBitmap,
container_idx: usize,
value_idx: usize,
current_len: usize,
current_values: Vec<u16>,
}
impl<'a> Iterator for RoaringIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<Self::Item> {
loop {
if self.value_idx < self.current_len {
let high = self.bitmap.containers[self.container_idx - 1].0 as u32;
let low = self.current_values[self.value_idx] as u32;
self.value_idx += 1;
return Some((high << 16) | low);
}
if self.container_idx >= self.bitmap.containers.len() {
return None;
}
let (_, container) = &self.bitmap.containers[self.container_idx];
self.current_len = Self::decode_container_into(container, &mut self.current_values);
self.value_idx = 0;
self.container_idx += 1;
}
}
}
impl<'a> RoaringIterator<'a> {
#[inline]
fn decode_container_into(container: &Container, output: &mut Vec<u16>) -> usize {
match container {
Container::Array(arr) => {
let len = arr.len();
if output.len() < len {
output.resize(len, 0);
}
output[..len].copy_from_slice(arr);
len
}
Container::Bitmap(bm) => Self::decode_bitmap_into(bm, output),
Container::Runs(runs) => {
let mut count = 0;
for &(start, len) in runs {
for i in 0..=len {
let val = start + i;
if count >= output.len() {
output.push(val);
} else {
output[count] = val;
}
count += 1;
}
}
count
}
}
}
#[inline]
fn decode_bitmap_into(bm: &[u64; 1024], output: &mut Vec<u16>) -> usize {
let mut count = 0;
for (word_idx, &word) in bm.iter().enumerate() {
if word == 0 {
continue;
}
let base = (word_idx * 64) as u16;
let mut w = word;
while w != 0 {
let bit_idx = w.trailing_zeros() as u16;
let val = base + bit_idx;
if count >= output.len() {
output.push(val);
} else {
output[count] = val;
}
count += 1;
w &= w - 1; }
}
count
}
}
pub const ROARING_BLOCK_SIZE: usize = 65536;
#[derive(Debug, Clone)]
pub struct RoaringBlockInfo {
pub container_key: u16,
pub first_doc_id: u32,
pub last_doc_id: u32,
pub max_tf: u32,
pub max_block_score: f32,
pub num_docs: u32,
}
impl RoaringBlockInfo {
pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u16::<LittleEndian>(self.container_key)?;
writer.write_u32::<LittleEndian>(self.first_doc_id)?;
writer.write_u32::<LittleEndian>(self.last_doc_id)?;
writer.write_u32::<LittleEndian>(self.max_tf)?;
writer.write_f32::<LittleEndian>(self.max_block_score)?;
writer.write_u32::<LittleEndian>(self.num_docs)?;
Ok(())
}
pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
Ok(Self {
container_key: reader.read_u16::<LittleEndian>()?,
first_doc_id: reader.read_u32::<LittleEndian>()?,
last_doc_id: reader.read_u32::<LittleEndian>()?,
max_tf: reader.read_u32::<LittleEndian>()?,
max_block_score: reader.read_f32::<LittleEndian>()?,
num_docs: reader.read_u32::<LittleEndian>()?,
})
}
}
#[derive(Debug, Clone)]
pub struct RoaringPostingList {
pub doc_ids: RoaringBitmap,
pub term_freqs: Vec<(u32, u32)>,
pub default_tf: u32,
pub max_tf: u32,
pub blocks: Vec<RoaringBlockInfo>,
pub max_score: f32,
}
impl RoaringPostingList {
pub fn from_postings(doc_ids: &[u32], term_freqs: &[u32]) -> Self {
Self::from_postings_with_idf(doc_ids, term_freqs, 1.0)
}
pub fn from_postings_with_idf(doc_ids: &[u32], term_freqs: &[u32], idf: f32) -> Self {
assert_eq!(doc_ids.len(), term_freqs.len());
if doc_ids.is_empty() {
return Self {
doc_ids: RoaringBitmap::new(),
term_freqs: Vec::new(),
default_tf: 1,
max_tf: 0,
blocks: Vec::new(),
max_score: 0.0,
};
}
let bitmap = RoaringBitmap::from_sorted_slice(doc_ids);
let mut tf_counts = std::collections::HashMap::new();
for &tf in term_freqs {
*tf_counts.entry(tf).or_insert(0u32) += 1;
}
let default_tf = tf_counts
.iter()
.max_by_key(|&(_, count)| count)
.map(|(&tf, _)| tf)
.unwrap_or(1);
let exceptions: Vec<(u32, u32)> = doc_ids
.iter()
.zip(term_freqs.iter())
.filter(|&(_, &tf)| tf != default_tf)
.map(|(&doc, &tf)| (doc, tf))
.collect();
let max_tf = *term_freqs.iter().max().unwrap_or(&1);
let mut blocks = Vec::new();
let mut max_score = 0.0f32;
let mut i = 0;
while i < doc_ids.len() {
let container_key = (doc_ids[i] >> 16) as u16;
let block_start = i;
while i < doc_ids.len() && (doc_ids[i] >> 16) as u16 == container_key {
i += 1;
}
let block_doc_ids = &doc_ids[block_start..i];
let block_tfs = &term_freqs[block_start..i];
let block_max_tf = *block_tfs.iter().max().unwrap_or(&1);
let block_score = crate::query::bm25_upper_bound(block_max_tf as f32, idf);
max_score = max_score.max(block_score);
blocks.push(RoaringBlockInfo {
container_key,
first_doc_id: block_doc_ids[0],
last_doc_id: *block_doc_ids.last().unwrap(),
max_tf: block_max_tf,
max_block_score: block_score,
num_docs: block_doc_ids.len() as u32,
});
}
Self {
doc_ids: bitmap,
term_freqs: exceptions,
default_tf,
max_tf,
blocks,
max_score,
}
}
pub fn get_tf(&self, doc_id: u32) -> u32 {
match self.term_freqs.binary_search_by_key(&doc_id, |&(d, _)| d) {
Ok(idx) => self.term_freqs[idx].1,
Err(_) => self.default_tf,
}
}
pub fn len(&self) -> u32 {
self.doc_ids.cardinality()
}
pub fn is_empty(&self) -> bool {
self.doc_ids.is_empty()
}
pub fn num_blocks(&self) -> usize {
self.blocks.len()
}
pub fn block_for_doc(&self, doc_id: u32) -> Option<usize> {
let container_key = (doc_id >> 16) as u16;
self.blocks
.binary_search_by_key(&container_key, |b| b.container_key)
.ok()
}
pub fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_ids.serialize(writer)?;
writer.write_u32::<LittleEndian>(self.default_tf)?;
writer.write_u32::<LittleEndian>(self.max_tf)?;
writer.write_f32::<LittleEndian>(self.max_score)?;
writer.write_u32::<LittleEndian>(self.term_freqs.len() as u32)?;
for &(doc, tf) in &self.term_freqs {
writer.write_u32::<LittleEndian>(doc)?;
writer.write_u32::<LittleEndian>(tf)?;
}
writer.write_u32::<LittleEndian>(self.blocks.len() as u32)?;
for block in &self.blocks {
block.serialize(writer)?;
}
Ok(())
}
pub fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let doc_ids = RoaringBitmap::deserialize(reader)?;
let default_tf = reader.read_u32::<LittleEndian>()?;
let max_tf = reader.read_u32::<LittleEndian>()?;
let max_score = reader.read_f32::<LittleEndian>()?;
let num_exceptions = reader.read_u32::<LittleEndian>()? as usize;
let mut term_freqs = Vec::with_capacity(num_exceptions);
for _ in 0..num_exceptions {
let doc = reader.read_u32::<LittleEndian>()?;
let tf = reader.read_u32::<LittleEndian>()?;
term_freqs.push((doc, tf));
}
let num_blocks = reader.read_u32::<LittleEndian>()? as usize;
let mut blocks = Vec::with_capacity(num_blocks);
for _ in 0..num_blocks {
blocks.push(RoaringBlockInfo::deserialize(reader)?);
}
Ok(Self {
doc_ids,
term_freqs,
default_tf,
max_tf,
blocks,
max_score,
})
}
pub fn iterator(&self) -> RoaringPostingIterator<'_> {
RoaringPostingIterator {
list: self,
doc_iter: self.doc_ids.iter(),
current_doc: None,
current_block: 0,
}
}
}
pub struct RoaringPostingIterator<'a> {
list: &'a RoaringPostingList,
doc_iter: RoaringIterator<'a>,
current_doc: Option<u32>,
current_block: usize,
}
impl<'a> RoaringPostingIterator<'a> {
pub fn doc(&self) -> u32 {
self.current_doc.unwrap_or(u32::MAX)
}
pub fn term_freq(&self) -> u32 {
match self.current_doc {
Some(doc) => self.list.get_tf(doc),
None => 0,
}
}
pub fn advance(&mut self) -> u32 {
self.current_doc = self.doc_iter.next();
if let Some(doc) = self.current_doc
&& !self.list.blocks.is_empty()
{
let container_key = (doc >> 16) as u16;
while self.current_block < self.list.blocks.len()
&& self.list.blocks[self.current_block].container_key < container_key
{
self.current_block += 1;
}
}
self.doc()
}
pub fn init(&mut self) {
self.current_doc = self.doc_iter.next();
self.current_block = 0;
}
pub fn seek(&mut self, target: u32) -> u32 {
if !self.list.blocks.is_empty() {
let target_container = (target >> 16) as u16;
while self.current_block < self.list.blocks.len()
&& self.list.blocks[self.current_block].last_doc_id < target
{
self.current_block += 1;
}
if self.current_block >= self.list.blocks.len() {
self.current_doc = None;
return u32::MAX;
}
let block = &self.list.blocks[self.current_block];
if block.container_key > target_container
|| (block.container_key == target_container && block.first_doc_id > self.doc())
{
while let Some(doc) = self.current_doc {
if doc >= block.first_doc_id {
break;
}
self.current_doc = self.doc_iter.next();
}
}
}
while let Some(doc) = self.current_doc {
if doc >= target {
return doc;
}
self.current_doc = self.doc_iter.next();
}
u32::MAX
}
pub fn is_exhausted(&self) -> bool {
self.current_doc.is_none()
}
pub fn current_block_max_score(&self) -> f32 {
if self.current_doc.is_none() || self.list.blocks.is_empty() {
return 0.0;
}
if self.current_block < self.list.blocks.len() {
self.list.blocks[self.current_block].max_block_score
} else {
0.0
}
}
pub fn current_block_max_tf(&self) -> u32 {
if self.current_doc.is_none() || self.list.blocks.is_empty() {
return 0;
}
if self.current_block < self.list.blocks.len() {
self.list.blocks[self.current_block].max_tf
} else {
0
}
}
pub fn max_remaining_score(&self) -> f32 {
if self.current_doc.is_none() || self.list.blocks.is_empty() {
return 0.0;
}
self.list.blocks[self.current_block..]
.iter()
.map(|b| b.max_block_score)
.fold(0.0f32, |a, b| a.max(b))
}
pub fn skip_to_block_with_doc(&mut self, target: u32) -> Option<(u32, f32)> {
if self.list.blocks.is_empty() {
return None;
}
while self.current_block < self.list.blocks.len() {
let block = &self.list.blocks[self.current_block];
if block.last_doc_id >= target {
while let Some(doc) = self.current_doc {
if doc >= block.first_doc_id {
break;
}
self.current_doc = self.doc_iter.next();
}
return Some((block.first_doc_id, block.max_block_score));
}
self.current_block += 1;
}
self.current_doc = None;
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_roaring_basic() {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(1);
bitmap.insert(100);
bitmap.insert(1000);
bitmap.insert(100000);
assert!(bitmap.contains(1));
assert!(bitmap.contains(100));
assert!(bitmap.contains(1000));
assert!(bitmap.contains(100000));
assert!(!bitmap.contains(2));
assert!(!bitmap.contains(50000));
assert_eq!(bitmap.cardinality(), 4);
}
#[test]
fn test_roaring_from_sorted() {
let values: Vec<u32> = (0..10000).map(|i| i * 3).collect();
let bitmap = RoaringBitmap::from_sorted_slice(&values);
assert_eq!(bitmap.cardinality(), 10000);
for &val in &values {
assert!(bitmap.contains(val), "Missing value {}", val);
}
}
#[test]
fn test_roaring_intersection() {
let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3, 100, 200, 300]);
let b = RoaringBitmap::from_sorted_slice(&[2, 3, 4, 200, 300, 400]);
let result = a.and(&b);
assert_eq!(result.cardinality(), 4);
assert!(result.contains(2));
assert!(result.contains(3));
assert!(result.contains(200));
assert!(result.contains(300));
}
#[test]
fn test_roaring_union() {
let a = RoaringBitmap::from_sorted_slice(&[1, 2, 3]);
let b = RoaringBitmap::from_sorted_slice(&[3, 4, 5]);
let result = a.or(&b);
assert_eq!(result.cardinality(), 5);
for i in 1..=5 {
assert!(result.contains(i));
}
}
#[test]
fn test_roaring_serialization() {
let values: Vec<u32> = (0..1000).map(|i| i * 7).collect();
let bitmap = RoaringBitmap::from_sorted_slice(&values);
let mut buffer = Vec::new();
bitmap.serialize(&mut buffer).unwrap();
let restored = RoaringBitmap::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(restored.cardinality(), bitmap.cardinality());
for &val in &values {
assert!(restored.contains(val));
}
}
#[test]
fn test_roaring_posting_list() {
let doc_ids: Vec<u32> = vec![1, 5, 10, 50, 100, 500, 1000];
let term_freqs: Vec<u32> = vec![1, 1, 2, 1, 3, 1, 1];
let list = RoaringPostingList::from_postings(&doc_ids, &term_freqs);
assert_eq!(list.len(), 7);
assert_eq!(list.default_tf, 1);
assert_eq!(list.max_tf, 3);
assert_eq!(list.get_tf(1), 1);
assert_eq!(list.get_tf(10), 2);
assert_eq!(list.get_tf(100), 3);
}
#[test]
fn test_roaring_iterator() {
let values: Vec<u32> = vec![1, 10, 100, 1000, 10000];
let bitmap = RoaringBitmap::from_sorted_slice(&values);
let collected: Vec<u32> = bitmap.iter().collect();
assert_eq!(collected, values);
}
#[test]
fn test_roaring_block_max() {
let mut doc_ids = Vec::new();
let mut term_freqs = Vec::new();
for i in 0..100 {
doc_ids.push(i * 100);
term_freqs.push(if i == 50 { 2 } else { 1 });
}
for i in 0..100 {
doc_ids.push(65536 + i * 100);
term_freqs.push(if i == 25 { 5 } else { 1 });
}
for i in 0..100 {
doc_ids.push(131072 + i * 100);
term_freqs.push(if i == 75 { 3 } else { 1 });
}
let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 2.0);
assert_eq!(list.num_blocks(), 3);
assert_eq!(list.blocks[0].container_key, 0);
assert_eq!(list.blocks[1].container_key, 1);
assert_eq!(list.blocks[2].container_key, 2);
assert_eq!(list.blocks[0].max_tf, 2);
assert_eq!(list.blocks[1].max_tf, 5);
assert_eq!(list.blocks[2].max_tf, 3);
assert!(list.blocks[1].max_block_score > list.blocks[0].max_block_score);
assert!(list.blocks[1].max_block_score > list.blocks[2].max_block_score);
assert_eq!(list.max_score, list.blocks[1].max_block_score);
let mut iter = list.iterator();
iter.init();
assert_eq!(iter.current_block_max_tf(), 2);
iter.seek(65536);
assert_eq!(iter.current_block_max_tf(), 5);
iter.seek(131072);
assert_eq!(iter.current_block_max_tf(), 3);
}
#[test]
fn test_roaring_block_max_serialization() {
let mut doc_ids = Vec::new();
let mut term_freqs = Vec::new();
for i in 0..50 {
doc_ids.push(i * 10);
term_freqs.push((i % 5) + 1);
}
for i in 0..50 {
doc_ids.push(65536 + i * 10);
term_freqs.push((i % 3) + 1);
}
let list = RoaringPostingList::from_postings_with_idf(&doc_ids, &term_freqs, 1.5);
let mut buffer = Vec::new();
list.serialize(&mut buffer).unwrap();
let restored = RoaringPostingList::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(restored.len(), list.len());
assert_eq!(restored.max_tf, list.max_tf);
assert_eq!(restored.max_score, list.max_score);
assert_eq!(restored.num_blocks(), list.num_blocks());
for (orig, rest) in list.blocks.iter().zip(restored.blocks.iter()) {
assert_eq!(orig.container_key, rest.container_key);
assert_eq!(orig.first_doc_id, rest.first_doc_id);
assert_eq!(orig.last_doc_id, rest.last_doc_id);
assert_eq!(orig.max_tf, rest.max_tf);
assert_eq!(orig.max_block_score, rest.max_block_score);
}
let mut iter1 = list.iterator();
let mut iter2 = restored.iterator();
iter1.init();
iter2.init();
while !iter1.is_exhausted() {
assert_eq!(iter1.doc(), iter2.doc());
assert_eq!(iter1.term_freq(), iter2.term_freq());
iter1.advance();
iter2.advance();
}
assert!(iter2.is_exhausted());
}
}