#![cfg_attr(feature = "simd", feature(portable_simd))]
#[cfg(test)]
extern crate quickcheck;
#[cfg(test)]
#[macro_use(quickcheck)]
extern crate quickcheck_macros;
use std::cmp::Ordering;
use std::mem;
mod constants;
mod enum_code;
mod rank_acceleration;
#[cfg(test)]
mod test_helpers;
use self::constants::{
LARGE_BLOCK_SIZE, SELECT_BLOCK_SIZE, SMALL_BLOCK_PER_LARGE_BLOCK, SMALL_BLOCK_SIZE,
};
use self::enum_code::ENUM_CODE_LENGTH;
/// Rank/select dictionary over a growable bit vector.
///
/// Bits are grouped into 64-bit small blocks. Every complete small block is
/// stored compressed in `sb_indices` (see `enum_code`), while the trailing,
/// possibly partial block lives uncompressed in `last_block` so `push` stays
/// cheap.
#[derive(Debug, Clone)]
pub struct RsDict {
    // Total number of bits pushed so far.
    len: u64,
    // Number of one bits, including those in `last_block`.
    num_ones: u64,
    // Number of zero bits, including those in `last_block`.
    num_zeros: u64,
    // Popcount ("class") of each complete small block; the class also
    // determines its compressed code length via `ENUM_CODE_LENGTH`.
    sb_classes: Vec<u8>,
    // Bit-packed enumerative codes, one per complete small block.
    sb_indices: VarintBuffer,
    // Per-large-block cache: cumulative one-count and the bit offset of the
    // block's first code within `sb_indices`.
    large_blocks: Vec<LargeBlock>,
    // For every SELECT_BLOCK_SIZE-th one bit, the index of the large block
    // containing it (starting point for `select1`).
    select_one_inds: Vec<u64>,
    // Same sampling for zero bits (starting point for `select0`).
    select_zero_inds: Vec<u64>,
    // Uncompressed trailing block still being filled by `push`.
    last_block: LastBlock,
}
impl RsDict {
/// Builds a dictionary from an iterator of 64-bit blocks (bit `j` of each
/// block becomes position `64 * i + j`), choosing a `popcnt`-accelerated
/// path when the CPU supports it.
pub fn from_blocks(blocks: impl Iterator<Item = u64>) -> Self {
    // Runtime feature detection: prefer the popcnt specialization on x86.
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    {
        if is_x86_feature_detected!("popcnt") {
            // SAFETY: we just verified the `popcnt` feature is available,
            // which is the only requirement of `from_blocks_popcount`.
            return unsafe { Self::from_blocks_popcount(blocks) };
        }
    }
    Self::from_blocks_impl(blocks)
}
/// Returns the number of heap-allocated bytes held by this structure
/// (based on capacities, not lengths; excludes `self` itself).
pub fn heap_size(&self) -> usize {
    let classes = self.sb_classes.capacity() * mem::size_of::<u8>();
    let indices = self.sb_indices.heap_size();
    let larges = self.large_blocks.capacity() * mem::size_of::<LargeBlock>();
    let one_inds = self.select_one_inds.capacity() * mem::size_of::<u64>();
    let zero_inds = self.select_zero_inds.capacity() * mem::size_of::<u64>();
    classes + indices + larges + one_inds + zero_inds
}
/// Same as `from_blocks_impl`, but compiled with the `popcnt` target
/// feature enabled so the compiler may emit the hardware popcount
/// instruction for the `count_ones` calls inside.
///
/// # Safety
/// The caller must ensure the running CPU supports `popcnt`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "popcnt")]
unsafe fn from_blocks_popcount(blocks: impl Iterator<Item = u64>) -> Self {
    Self::from_blocks_impl(blocks)
}
/// Shared construction routine: consumes 64-bit blocks and builds every
/// index structure in a single pass.
#[inline(always)]
fn from_blocks_impl(blocks: impl Iterator<Item = u64>) -> Self {
    // Preallocate from the iterator's upper size hint when available.
    // NOTE(review): `hint` counts 64-bit blocks, not bits, so these
    // divisions look like underestimates compared to `with_capacity`,
    // where `n` is a bit count — confirm intent.
    let (_, hint) = blocks.size_hint();
    let hint = hint.unwrap_or(0);
    let mut large_blocks = Vec::with_capacity(hint / LARGE_BLOCK_SIZE as usize);
    let mut select_one_inds = Vec::with_capacity(hint / SELECT_BLOCK_SIZE as usize);
    let mut select_zero_inds = Vec::with_capacity(hint / SELECT_BLOCK_SIZE as usize);
    let mut sb_classes = Vec::with_capacity(hint / SMALL_BLOCK_SIZE as usize);
    let mut sb_indices = VarintBuffer::with_capacity(hint);
    let mut last_block = LastBlock::new();
    let mut num_ones = 0;
    let mut num_zeros = 0;
    // Peekable so the final block can be diverted into `last_block`
    // (kept uncompressed) instead of being encoded.
    let mut iter = blocks.enumerate().peekable();
    while let Some((i, block)) = iter.next() {
        let sb_class = block.count_ones() as u8;
        // Start a new large block every SMALL_BLOCK_PER_LARGE_BLOCK small
        // blocks, caching the cumulative rank and current code offset.
        if i as u64 % SMALL_BLOCK_PER_LARGE_BLOCK == 0 {
            let lblock = LargeBlock {
                rank: num_ones,
                pointer: sb_indices.len() as u64,
            };
            large_blocks.push(lblock);
        }
        if iter.peek().is_none() {
            // Final block: becomes the writable uncompressed tail.
            last_block.bits = block;
            last_block.num_ones = sb_class as u64;
            last_block.num_zeros = 64 - sb_class as u64;
        } else {
            // Interior block: store its class and enumerative code.
            sb_classes.push(sb_class);
            let (code_len, code) = enum_code::encode(block, sb_class);
            sb_indices.push(code_len as usize, code);
        }
        let lb_start = i as u64 * SMALL_BLOCK_SIZE / LARGE_BLOCK_SIZE;
        // If this block carries the running one-count across a
        // SELECT_BLOCK_SIZE boundary, record which large block it's in.
        let start = num_ones + SELECT_BLOCK_SIZE - 1;
        let end = num_ones + SELECT_BLOCK_SIZE + sb_class as u64 - 1;
        if start / SELECT_BLOCK_SIZE != end / SELECT_BLOCK_SIZE {
            select_one_inds.push(lb_start);
        }
        // Same bookkeeping for the running zero-count.
        let start = num_zeros + SELECT_BLOCK_SIZE - 1;
        let end = num_zeros + SELECT_BLOCK_SIZE + (64 - sb_class as u64) - 1;
        if start / SELECT_BLOCK_SIZE != end / SELECT_BLOCK_SIZE {
            select_zero_inds.push(lb_start);
        }
        num_ones += sb_class as u64;
        num_zeros += 64 - sb_class as u64;
    }
    // Pad `sb_classes` capacity to a large-block multiple, matching the
    // padding done incrementally in `write_block`.
    // NOTE(review): `Vec::reserve` takes an *additional* element count,
    // so this over-reserves beyond the aligned length; presumably
    // intentional slack for block scans — confirm.
    let num_sb = sb_classes.len();
    let align = SMALL_BLOCK_PER_LARGE_BLOCK as usize;
    sb_classes.reserve((num_sb + align - 1) / align * align);
    Self {
        large_blocks,
        select_one_inds,
        select_zero_inds,
        sb_classes,
        sb_indices,
        len: num_ones + num_zeros,
        num_ones,
        num_zeros,
        last_block,
    }
}
/// Creates an empty dictionary with no preallocation.
pub fn new() -> Self {
    Self::with_capacity(0)
}
/// Creates an empty dictionary with storage preallocated for `n` bits.
pub fn with_capacity(n: usize) -> Self {
    let large_blocks = Vec::with_capacity(n / LARGE_BLOCK_SIZE as usize);
    let select_one_inds = Vec::with_capacity(n / SELECT_BLOCK_SIZE as usize);
    let select_zero_inds = Vec::with_capacity(n / SELECT_BLOCK_SIZE as usize);
    let sb_classes = Vec::with_capacity(n / SMALL_BLOCK_SIZE as usize);
    let sb_indices = VarintBuffer::with_capacity(n);
    Self {
        len: 0,
        num_ones: 0,
        num_zeros: 0,
        sb_classes,
        sb_indices,
        large_blocks,
        select_one_inds,
        select_zero_inds,
        last_block: LastBlock::new(),
    }
}
/// Counts occurrences of `bit` in positions `0..pos` (exclusive).
///
/// # Panics
/// Panics if `pos >= self.len()`.
pub fn rank(&self, pos: u64, bit: bool) -> u64 {
    if pos >= self.len {
        panic!("Out of bounds position: {} >= {}", pos, self.len);
    }
    if self.is_last_block(pos) {
        // All ones at or after `pos` live in the trailing block, so the
        // ones before `pos` are the total minus that suffix count.
        let trailing_ones = self.last_block.count_suffix(pos % SMALL_BLOCK_SIZE);
        return rank_by_bit(self.num_ones - trailing_ones, pos, bit);
    }
    // Start from the enclosing large block's cached rank and code offset.
    let lblock = pos / LARGE_BLOCK_SIZE;
    let LargeBlock {
        mut pointer,
        mut rank,
    } = self.large_blocks[lblock as usize];
    // Accumulate classes (popcounts) and code lengths over the small
    // blocks preceding `pos` within this large block.
    let sblock_start = (lblock * SMALL_BLOCK_PER_LARGE_BLOCK) as usize;
    let sblock = (pos / SMALL_BLOCK_SIZE) as usize;
    let (class_sum, length_sum) =
        rank_acceleration::scan_block(&self.sb_classes, sblock_start, sblock);
    rank += class_sum;
    pointer += length_sum;
    // Finally, count ones in the partial small block before `pos`.
    if pos % SMALL_BLOCK_SIZE != 0 {
        let sb_class = self.sb_classes[sblock];
        let code = self.read_sb_index(pointer, ENUM_CODE_LENGTH[sb_class as usize]);
        rank += enum_code::rank(code, sb_class, pos % SMALL_BLOCK_SIZE);
    }
    rank_by_bit(rank, pos, bit)
}
/// Returns the bit at `pos` together with the number of ones in `0..pos`
/// — a single combined lookup instead of `get_bit` plus `rank`.
///
/// # Panics
/// Panics if `pos >= self.len()`.
pub fn bit_and_one_rank(&self, pos: u64) -> (bool, u64) {
    if pos >= self.len {
        panic!("Out of bounds position: {} >= {}", pos, self.len);
    }
    if self.is_last_block(pos) {
        // Answer directly from the uncompressed trailing block.
        let sb_pos = pos % SMALL_BLOCK_SIZE;
        let bit = self.last_block.get_bit(sb_pos);
        let after_rank = self.last_block.count_suffix(sb_pos);
        return (bit, self.num_ones - after_rank);
    }
    let lblock = pos / LARGE_BLOCK_SIZE;
    let sblock = (pos / SMALL_BLOCK_SIZE) as usize;
    let sblock_start = (lblock * SMALL_BLOCK_PER_LARGE_BLOCK) as usize;
    let LargeBlock {
        mut pointer,
        mut rank,
    } = self.large_blocks[lblock as usize];
    // Scalar scan over the preceding small blocks in this large block
    // (unlike `rank`, which goes through `rank_acceleration::scan_block`).
    for &sb_class in &self.sb_classes[sblock_start..sblock] {
        pointer += ENUM_CODE_LENGTH[sb_class as usize] as u64;
        rank += sb_class as u64;
    }
    // Decode the target small block once, for both the bit and the rank.
    let sb_class = self.sb_classes[sblock];
    let code_length = ENUM_CODE_LENGTH[sb_class as usize];
    let code = self.read_sb_index(pointer, code_length);
    rank += enum_code::rank(code, sb_class, pos % SMALL_BLOCK_SIZE);
    let bit = enum_code::decode_bit(code, sb_class, pos % SMALL_BLOCK_SIZE);
    (bit, rank)
}
/// Counts occurrences of `bit` in positions `0..=pos` (inclusive).
pub fn inclusive_rank(&self, pos: u64, bit: bool) -> u64 {
    let (pos_bit, one_rank) = self.bit_and_one_rank(pos);
    // Exclusive rank, plus one if the bit at `pos` itself matches.
    rank_by_bit(one_rank, pos, bit) + u64::from(pos_bit == bit)
}
/// Returns the position of the `rank`-th occurrence (0-based) of `bit`,
/// or `None` if there are not enough such bits.
pub fn select(&self, rank: u64, bit: bool) -> Option<u64> {
    match bit {
        true => self.select1(rank),
        false => self.select0(rank),
    }
}
/// Returns the position of the `rank`-th (0-based) zero bit, or `None`
/// if fewer than `rank + 1` zeros exist.
pub fn select0(&self, rank: u64) -> Option<u64> {
    if rank >= self.num_zeros {
        return None;
    }
    // Fast path: the target zero lies inside the trailing block.
    let prefix_num_zeros = self.num_zeros - self.last_block.num_zeros;
    if rank >= prefix_num_zeros {
        let lb_rank = (rank - prefix_num_zeros) as u8;
        return Some(self.last_block_ind() + self.last_block.select0(lb_rank));
    }
    // Jump near the answer via the sampled select index, then scan large
    // blocks for the first whose preceding zero-count exceeds `rank`.
    let select_ind = (rank / SELECT_BLOCK_SIZE) as usize;
    let lb_start = self.select_zero_inds[select_ind] as usize;
    let mut lblock = None;
    for (i, large_block) in self.large_blocks[lb_start..].iter().enumerate() {
        let lb_ix = (lb_start + i) as u64;
        // Zeros before this large block = bits before it minus its rank.
        let lb_rank = lb_ix * LARGE_BLOCK_SIZE - large_block.rank;
        if rank < lb_rank {
            lblock = Some(lb_ix - 1);
            break;
        }
    }
    // If no scanned block overshot `rank`, the answer is in the last one.
    let lblock = lblock.unwrap_or(self.large_blocks.len() as u64 - 1);
    let large_block = &self.large_blocks[lblock as usize];
    // Walk the small blocks, decoding only the one containing the target.
    let sb_start = (lblock * SMALL_BLOCK_PER_LARGE_BLOCK) as usize;
    let mut pointer = large_block.pointer;
    let mut remaining = rank - (lblock * LARGE_BLOCK_SIZE - large_block.rank);
    for (i, &sb_class) in self.sb_classes[sb_start..].iter().enumerate() {
        let sb_zeros = (SMALL_BLOCK_SIZE as u8 - sb_class) as u64;
        let code_length = ENUM_CODE_LENGTH[sb_class as usize];
        if remaining < sb_zeros {
            let code = self.read_sb_index(pointer, code_length);
            let sb_rank = (sb_start + i) as u64 * SMALL_BLOCK_SIZE;
            let block_rank = enum_code::select0(code, sb_class, remaining);
            return Some(sb_rank + block_rank);
        }
        remaining -= sb_zeros;
        pointer += code_length as u64;
    }
    panic!("Ran out of small blocks when iterating over rank");
}
/// Returns the position of the `rank`-th (0-based) one bit, or `None`
/// if fewer than `rank + 1` ones exist.
pub fn select1(&self, rank: u64) -> Option<u64> {
    if rank >= self.num_ones {
        return None;
    }
    // Fast path: the target one lies inside the trailing block.
    let prefix_num_ones = self.num_ones - self.last_block.num_ones;
    if rank >= prefix_num_ones {
        let lb_rank = (rank - prefix_num_ones) as u8;
        return Some(self.last_block_ind() + self.last_block.select1(lb_rank));
    }
    // Jump near the answer via the sampled select index, then scan large
    // blocks for the first whose cached cumulative rank exceeds `rank`.
    let select_ind = (rank / SELECT_BLOCK_SIZE) as usize;
    let lb_start = self.select_one_inds[select_ind] as usize;
    let mut lblock = None;
    for (i, large_block) in self.large_blocks[lb_start..].iter().enumerate() {
        if rank < large_block.rank {
            lblock = Some((lb_start + i - 1) as u64);
            break;
        }
    }
    // If no scanned block overshot `rank`, the answer is in the last one.
    let lblock = lblock.unwrap_or(self.large_blocks.len() as u64 - 1);
    let large_block = &self.large_blocks[lblock as usize];
    // Walk the small blocks, decoding only the one containing the target.
    let sb_start = (lblock * SMALL_BLOCK_PER_LARGE_BLOCK) as usize;
    let mut pointer = large_block.pointer;
    let mut remaining = rank - large_block.rank;
    for (i, &sb_class) in self.sb_classes[sb_start..].iter().enumerate() {
        let sb_ones = sb_class as u64;
        let code_length = ENUM_CODE_LENGTH[sb_class as usize];
        if remaining < sb_ones {
            let code = self.read_sb_index(pointer, code_length);
            let sb_rank = (sb_start + i) as u64 * SMALL_BLOCK_SIZE;
            let block_rank = enum_code::select1(code, sb_class, remaining);
            return Some(sb_rank + block_rank);
        }
        remaining -= sb_ones;
        pointer += code_length as u64;
    }
    panic!("Ran out of small blocks when iterating over rank");
}
/// Returns the number of bits in the dictionary.
pub fn len(&self) -> usize {
    self.len as usize
}
/// Returns `true` if no bits have been pushed.
pub fn is_empty(&self) -> bool {
    self.len == 0
}
/// Returns the total number of one bits.
pub fn count_ones(&self) -> usize {
    self.num_ones as usize
}
/// Returns the total number of zero bits.
pub fn count_zeros(&self) -> usize {
    self.num_zeros as usize
}
/// Appends `bit` to the end of the bit vector.
pub fn push(&mut self, bit: bool) {
    // The trailing block is full (or this is the very first push):
    // compress it into `sb_indices` and record any new large block
    // before writing the incoming bit.
    if self.len % SMALL_BLOCK_SIZE == 0 {
        self.write_block();
    }
    if bit {
        self.last_block.set_one(self.len % SMALL_BLOCK_SIZE);
        // Sample every SELECT_BLOCK_SIZE-th one bit: remember which
        // large block it lands in, as select1's starting point.
        if self.num_ones % SELECT_BLOCK_SIZE == 0 {
            self.select_one_inds.push(self.len / LARGE_BLOCK_SIZE);
        }
        self.num_ones += 1;
    } else {
        self.last_block.set_zero(self.len % SMALL_BLOCK_SIZE);
        // Same sampling for zero bits, used by select0.
        if self.num_zeros % SELECT_BLOCK_SIZE == 0 {
            self.select_zero_inds.push(self.len / LARGE_BLOCK_SIZE);
        }
        self.num_zeros += 1;
    }
    self.len += 1;
}
/// Reads the bit at position `pos`.
///
/// Positions in the trailing block are answered from `last_block`;
/// earlier positions require decoding the enclosing small block's code.
pub fn get_bit(&self, pos: u64) -> bool {
    if self.is_last_block(pos) {
        return self.last_block.get_bit(pos % SMALL_BLOCK_SIZE);
    }
    let lblock = pos / LARGE_BLOCK_SIZE;
    let sblock = (pos / SMALL_BLOCK_SIZE) as usize;
    let sblock_start = (lblock * SMALL_BLOCK_PER_LARGE_BLOCK) as usize;
    // Skip past the codes of the earlier small blocks in this large
    // block to find where our block's code begins.
    let skipped: u64 = self.sb_classes[sblock_start..sblock]
        .iter()
        .map(|&class| ENUM_CODE_LENGTH[class as usize] as u64)
        .sum();
    let pointer = self.large_blocks[lblock as usize].pointer + skipped;
    let sb_class = self.sb_classes[sblock];
    let code = self.read_sb_index(pointer, ENUM_CODE_LENGTH[sb_class as usize]);
    enum_code::decode_bit(code, sb_class, pos % SMALL_BLOCK_SIZE)
}
pub fn iter(&self) -> impl Iterator<Item = bool> + '_ {
struct RsDictIter<'a> {
rsdict: &'a RsDict,
pos: u64,
}
impl<'a> Iterator for RsDictIter<'a> {
type Item = bool;
fn next(&mut self) -> Option<bool> {
if self.pos >= self.rsdict.len {
return None;
}
let out = self.rsdict.get_bit(self.pos);
self.pos += 1;
Some(out)
}
}
RsDictIter {
rsdict: self,
pos: 0,
}
}
/// Flushes the full trailing block into compressed storage and, on
/// large-block boundaries, records a new `LargeBlock` entry.
///
/// Called by `push` whenever `len` is a multiple of SMALL_BLOCK_SIZE;
/// on the very first push (`len == 0`) only the large block is recorded.
fn write_block(&mut self) {
    if self.len > 0 {
        // Swap in a fresh trailing block and compress the old one.
        let block = mem::replace(&mut self.last_block, LastBlock::new());
        let sb_class = block.num_ones as u8;
        self.sb_classes.push(sb_class);
        // Keep capacity padded to a large-block multiple, mirroring the
        // padding done at the end of `from_blocks_impl`.
        // NOTE(review): `Vec::reserve` takes an *additional* count, so
        // this over-reserves beyond the aligned length — presumably
        // intentional slack; confirm.
        let num_sb = self.sb_classes.len();
        let align = SMALL_BLOCK_PER_LARGE_BLOCK as usize;
        self.sb_classes
            .reserve((num_sb + align - 1) / align * align);
        let (code_len, code) = enum_code::encode(block.bits, sb_class);
        self.sb_indices.push(code_len as usize, code);
    }
    if self.len % LARGE_BLOCK_SIZE == 0 {
        let lblock = LargeBlock {
            rank: self.num_ones,
            pointer: self.sb_indices.len() as u64,
        };
        self.large_blocks.push(lblock);
    }
}
/// Bit index at which the uncompressed trailing block begins.
fn last_block_ind(&self) -> u64 {
    match self.len {
        0 => 0,
        len => ((len - 1) / SMALL_BLOCK_SIZE) * SMALL_BLOCK_SIZE,
    }
}
/// Whether `pos` falls inside the uncompressed trailing block.
fn is_last_block(&self, pos: u64) -> bool {
    pos >= self.last_block_ind()
}
/// Reads a `code_len`-bit enumerative code starting at bit offset `ptr`.
fn read_sb_index(&self, ptr: u64, code_len: u8) -> u64 {
    self.sb_indices.get(ptr as usize, code_len as usize)
}
}
impl PartialEq for RsDict {
    /// Two dictionaries are equal when they hold the same bit sequence,
    /// regardless of internal block layout or spare capacity.
    fn eq(&self, rhs: &Self) -> bool {
        // Length check first; iterating would reach the same verdict,
        // but this short-circuits obvious mismatches.
        self.len == rhs.len && self.iter().eq(rhs.iter())
    }
}
impl Eq for RsDict {}
impl PartialOrd for RsDict {
    /// Delegates to [`Ord::cmp`]: since `RsDict` has a total order this
    /// never returns `None`. (Previously computed independently via
    /// `Iterator::partial_cmp`; delegating is the canonical form and
    /// guarantees the two orderings can never disagree.)
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        Some(self.cmp(rhs))
    }
}
impl Ord for RsDict {
    /// Lexicographic comparison of the two bit sequences.
    fn cmp(&self, rhs: &Self) -> Ordering {
        self.iter().cmp(rhs.iter())
    }
}
// Per-large-block acceleration entry cached by `RsDict`.
#[derive(Clone, Debug, Eq, PartialEq)]
struct LargeBlock {
    // Bit offset into `RsDict::sb_indices` of this block's first code.
    pointer: u64,
    // Number of one bits before the start of this large block.
    rank: u64,
}
// Append-only, bit-granular buffer backed by 64-bit words.
#[derive(Clone, Debug, Eq, PartialEq)]
struct VarintBuffer {
    // Packed storage words.
    buf: Vec<u64>,
    // Number of bits written (the last word may be partially filled).
    len: usize,
}
impl VarintBuffer {
fn with_capacity(bits: usize) -> Self {
Self {
buf: Vec::with_capacity(bits / 64),
len: 0,
}
}
/// Appends the low `num_bits` bits of `value` to the buffer.
///
/// NOTE(review): assumes the bits of `value` above `num_bits` are zero —
/// otherwise the `|=` writes would corrupt neighboring entries. Callers
/// pass codes from `enum_code::encode`, presumably already masked; confirm.
fn push(&mut self, num_bits: usize, value: u64) {
    debug_assert!(num_bits <= 64);
    if num_bits == 0 {
        return;
    }
    let (block, offset) = (self.len / 64, self.len % 64);
    // Grow by one word when starting a fresh word (buf.len() == block
    // only happens at offset 0) or when the value spills into the next.
    if self.buf.len() == block || offset + num_bits > 64 {
        self.buf.push(0);
    }
    self.buf[block] |= value << offset;
    if offset + num_bits > 64 {
        // Write the high part that spilled into the following word.
        self.buf[block + 1] |= value >> (64 - offset);
    }
    self.len += num_bits;
}
/// Reads `num_bits` bits starting at bit offset `index`.
fn get(&self, index: usize, num_bits: usize) -> u64 {
    debug_assert!(num_bits <= 64);
    if num_bits == 0 {
        return 0;
    }
    let (block, offset) = (index / 64, index % 64);
    // Mask of `num_bits` low ones; `checked_shl` handles num_bits == 64,
    // where a plain shift would overflow (0u64.wrapping_sub(1) = !0).
    let mask = 1u64
        .checked_shl(num_bits as u32)
        .unwrap_or(0)
        .wrapping_sub(1);
    let mut ret = (self.buf[block] >> offset) & mask;
    if offset + num_bits > 64 {
        // Pull in the high bits that live in the following word.
        ret |= self.buf[block + 1] << (64 - offset);
    }
    ret & mask
}
/// Heap bytes owned by the backing vector (capacity, not length).
fn heap_size(&self) -> usize {
    mem::size_of::<u64>() * self.buf.capacity()
}
/// Total number of bits pushed so far.
fn len(&self) -> usize {
    self.len
}
}
// Uncompressed trailing 64-bit block still accepting pushed bits.
#[derive(Clone, Debug, Eq, PartialEq)]
struct LastBlock {
    // Raw bits; position 0 is the least-significant bit.
    bits: u64,
    // Ones written into this block so far.
    num_ones: u64,
    // Zeros written into this block so far.
    num_zeros: u64,
}
impl LastBlock {
    /// An empty block: no bits set, both counters at zero.
    fn new() -> Self {
        LastBlock {
            bits: 0,
            num_ones: 0,
            num_zeros: 0,
        }
    }
    /// Position of the `rank`-th (0-based) zero bit in the block.
    /// Requires `rank < num_zeros` (checked in debug builds only).
    fn select0(&self, rank: u8) -> u64 {
        debug_assert!(rank < self.num_zeros as u8);
        // Selecting a zero is selecting a one in the complement.
        enum_code::select1_raw(!self.bits, rank as u64)
    }
    /// Position of the `rank`-th (0-based) one bit in the block.
    /// Requires `rank < num_ones` (checked in debug builds only).
    fn select1(&self, rank: u8) -> u64 {
        debug_assert!(rank < self.num_ones as u8);
        enum_code::select1_raw(self.bits, rank as u64)
    }
    /// Number of one bits at positions `pos..64`.
    fn count_suffix(&self, pos: u64) -> u64 {
        (self.bits >> pos).count_ones() as u64
    }
    /// Reads the bit at `pos`.
    fn get_bit(&self, pos: u64) -> bool {
        (self.bits >> pos) & 1 == 1
    }
    /// Sets the bit at `pos` and bumps the one-counter.
    fn set_one(&mut self, pos: u64) {
        self.bits |= 1 << pos;
        self.num_ones += 1;
    }
    /// Records a zero at `pos`: bits start cleared, so only the counter
    /// needs updating (the position itself is intentionally unused).
    fn set_zero(&mut self, _pos: u64) {
        self.num_zeros += 1;
    }
}
/// Converts a one-count into a count of `b`-valued bits: returns `x`
/// (the number of ones) when `b` is true, otherwise `n - x` (the number
/// of zeros among `n` total bits).
fn rank_by_bit(x: u64, n: u64, b: bool) -> u64 {
    match b {
        true => x,
        false => n - x,
    }
}
#[cfg(test)]
mod tests {
use super::RsDict;
use crate::test_helpers::hash_u64;
/// Expands quickcheck's raw `u64`s into a pseudo-random bit vector.
///
/// Each input block is hashed four times and only odd hashes contribute
/// their 64 bits, so the output length varies with the input. A few
/// trailing bits (count derived from the first block) are then popped so
/// lengths that aren't multiples of 64 also get exercised.
fn hash_u64_blocks(blocks: &[u64]) -> Vec<bool> {
    let mut bits = Vec::with_capacity(blocks.len() * 64);
    let to_pop = blocks.get(0).unwrap_or(&0) % 64;
    for block in blocks {
        for i in 0..4 {
            // Shadowed: the hashed value, not the input block.
            let block = hash_u64(block.wrapping_add(i));
            if block % 2 != 0 {
                for j in 0..64 {
                    let bit = (block >> j) & 1 != 0;
                    bits.push(bit);
                }
            }
        }
    }
    for _ in 0..to_pop {
        bits.pop();
    }
    bits
}
/// Exhaustively checks `rank`, `select`, `get_bit`, `bit_and_one_rank`,
/// and `iter` of an `RsDict` against a naive `&[bool]` reference.
fn check_rsdict(bits: &[bool]) {
    let mut rs_dict = RsDict::with_capacity(bits.len());
    for &bit in bits {
        rs_dict.push(bit);
    }
    // rank(i) must match a running count at every position.
    let mut one_rank = 0;
    let mut zero_rank = 0;
    for (i, &inp_bit) in bits.iter().enumerate() {
        assert_eq!(rs_dict.rank(i as u64, false), zero_rank);
        assert_eq!(rs_dict.rank(i as u64, true), one_rank);
        if inp_bit {
            one_rank += 1;
        } else {
            zero_rank += 1;
        }
    }
    // select(r) must return the position of the r-th matching bit.
    let mut one_rank = 0;
    let mut zero_rank = 0;
    for (i, &inp_bit) in bits.iter().enumerate() {
        if inp_bit {
            assert_eq!(rs_dict.select(one_rank as u64, true), Some(i as u64));
            one_rank += 1;
        } else {
            assert_eq!(rs_dict.select(zero_rank as u64, false), Some(i as u64));
            zero_rank += 1;
        }
    }
    // Out-of-range ranks must return None, starting at the very first
    // invalid rank. (Previously started at `one_rank + 1`, which skipped
    // the boundary case `select(count) == None`.)
    for r in one_rank..bits.len() {
        assert_eq!(rs_dict.select(r as u64, true), None);
    }
    for r in zero_rank..bits.len() {
        assert_eq!(rs_dict.select(r as u64, false), None);
    }
    for (i, &bit) in bits.iter().enumerate() {
        assert_eq!(rs_dict.get_bit(i as u64), bit);
    }
    let mut one_rank = 0;
    for (i, &bit) in bits.iter().enumerate() {
        let (rs_bit, rs_rank) = rs_dict.bit_and_one_rank(i as u64);
        assert_eq!((rs_bit, rs_rank), (bit, one_rank));
        if bit {
            one_rank += 1;
        }
    }
    assert!(bits.iter().cloned().eq(rs_dict.iter()));
    // Check the summary counters. (Previously `assert_eq!(bits, bits)`,
    // a tautology that tested nothing.)
    assert_eq!(rs_dict.len(), bits.len());
    assert_eq!(rs_dict.count_ones(), bits.iter().filter(|&&b| b).count());
    assert_eq!(rs_dict.count_zeros(), bits.iter().filter(|&&b| !b).count());
}
/// Property test: building via `from_blocks` (plus pushes for the ragged
/// tail) yields a structure field-for-field identical to one built by
/// `push` alone.
#[quickcheck]
fn qc_from_blocks(blocks: Vec<u64>) {
    let bits = hash_u64_blocks(&blocks);
    let mut rs_dict = RsDict::with_capacity(bits.len());
    for &bit in &bits {
        rs_dict.push(bit);
    }
    // Repack the prefix of whole 64-bit blocks, bit j at 1 << j.
    let blocks = (0..(bits.len() / 64)).map(|i| {
        let mut block = 0u64;
        for j in 0..64 {
            if bits[i * 64 + j] {
                block |= 1 << j;
            }
        }
        block
    });
    let mut block_rs_dict = RsDict::from_blocks(blocks);
    // Push the leftover bits that didn't fill a whole block.
    for i in (bits.len() / 64 * 64)..bits.len() {
        block_rs_dict.push(bits[i]);
    }
    // Compare every internal field, not just observable behavior.
    assert_eq!(rs_dict.len, block_rs_dict.len);
    assert_eq!(rs_dict.num_ones, block_rs_dict.num_ones);
    assert_eq!(rs_dict.num_zeros, block_rs_dict.num_zeros);
    assert_eq!(rs_dict.sb_classes, block_rs_dict.sb_classes);
    assert_eq!(rs_dict.sb_indices, block_rs_dict.sb_indices);
    assert_eq!(rs_dict.large_blocks, block_rs_dict.large_blocks);
    assert_eq!(rs_dict.select_one_inds, block_rs_dict.select_one_inds);
    assert_eq!(rs_dict.select_zero_inds, block_rs_dict.select_zero_inds);
    assert_eq!(rs_dict.last_block, block_rs_dict.last_block);
}
/// Property test: a push-built `RsDict` satisfies all rank/select
/// invariants on pseudo-random inputs.
#[quickcheck]
fn qc_rsdict(blocks: Vec<u64>) {
    check_rsdict(&hash_u64_blocks(&blocks));
}
/// Deterministic cases sized to straddle small-block, large-block, and
/// select-block boundaries.
#[test]
fn test_large_rsdicts() {
    let sizes = [65usize, 1025, 3121, 3185, 4097, 8193];
    for &n in &sizes {
        check_rsdict(&vec![true; n]);
    }
    for &n in &sizes {
        check_rsdict(&vec![false; n]);
    }
    let alternating: Vec<bool> = (0..8193).map(|i| i % 2 == 0).collect();
    check_rsdict(&alternating);
}
/// Dictionaries compare by their bit contents: an all-zero block sorts
/// before a block with its lowest bit set.
#[test]
fn test_ordering() {
    let zeros = RsDict::from_blocks(std::iter::once(0u64));
    let low_one = RsDict::from_blocks(std::iter::once(1u64));
    assert_ne!(zeros, low_one);
    assert!(zeros < low_one);
}
}