const BATCH_SIZE: usize = 8;
use crate::error::{Result, ZiporaError};
use crate::algorithms::bit_ops::select_in_word;
use crate::succinct::BitVector;
use std::cmp::Ordering;
const RANK_SAMPLE_RATE: usize = 8;
const SELECT1_SAMPLE_RATE: usize = 256;
const SELECT0_SAMPLE_RATE: usize = 64;
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct EliasFano {
low_bits: Vec<u64>,
high_bits: Vec<u64>,
len: usize,
low_bit_width: u32,
universe: u64,
high_len_bits: usize,
rank_samples: Vec<u32>,
select1_samples: Vec<u32>,
select0_samples: Vec<u32>,
}
impl EliasFano {
pub fn from_sorted(values: &[u32]) -> Self {
if values.is_empty() {
return Self::from_sorted_u64(&[]);
}
let universe = values[values.len() - 1] as u64 + 1;
Self::from_sorted_impl(values.len(), universe, |i| values[i] as u64)
}
pub fn from_sorted_u64(values: &[u64]) -> Self {
if values.is_empty() {
return Self {
low_bits: Vec::new(),
high_bits: Vec::new(),
len: 0,
low_bit_width: 0,
universe: 0,
high_len_bits: 0,
rank_samples: Vec::new(),
select1_samples: Vec::new(),
select0_samples: Vec::new(),
};
}
let universe = values[values.len() - 1] + 1;
Self::from_sorted_impl(values.len(), universe, |i| values[i])
}
fn from_sorted_impl(n: usize, universe: u64, get_val: impl Fn(usize) -> u64) -> Self {
let low_bit_width = if n >= universe as usize {
0
} else {
(64 - (universe / n as u64).leading_zeros()).saturating_sub(1)
};
let low_mask = if low_bit_width == 0 { 0u64 } else { (1u64 << low_bit_width) - 1 };
let total_low_bits = n as u64 * low_bit_width as u64;
let low_words = ((total_low_bits + 63) / 64) as usize;
let mut low_bits = vec![0u64; low_words + 1];
for i in 0..n {
if low_bit_width > 0 {
let low_val = get_val(i) & low_mask;
let bit_pos = i as u64 * low_bit_width as u64;
let word_idx = (bit_pos / 64) as usize;
let bit_idx = (bit_pos % 64) as u32;
low_bits[word_idx] |= low_val << bit_idx;
if bit_idx + low_bit_width > 64 && word_idx + 1 < low_words {
low_bits[word_idx + 1] |= low_val >> (64 - bit_idx);
}
}
}
let last_val = get_val(n - 1);
let max_high = last_val >> low_bit_width;
let high_len_bits = n + max_high as usize + 1;
let high_words = (high_len_bits + 63) / 64;
let mut high_bits = vec![0u64; high_words];
let mut pos = 0usize;
let mut prev_high = 0u64;
for i in 0..n {
let high = get_val(i) >> low_bit_width;
pos += (high - prev_high) as usize;
let word_idx = pos / 64;
let bit_idx = pos % 64;
if word_idx < high_words {
high_bits[word_idx] |= 1u64 << bit_idx;
}
pos += 1;
prev_high = high;
}
let (rank_samples, select1_samples, select0_samples) =
Self::build_samples(&high_bits, high_len_bits);
Self {
low_bits,
high_bits,
len: n,
low_bit_width,
universe,
high_len_bits,
rank_samples,
select1_samples,
select0_samples,
}
}
fn build_samples(high_bits: &[u64], high_len_bits: usize) -> (Vec<u32>, Vec<u32>, Vec<u32>) {
let mut rank_samples = Vec::new();
let mut select1_samples = Vec::new();
let mut select0_samples = Vec::new();
let mut cumul_ones: u32 = 0;
let mut cumul_zeros: u32 = 0;
let mut next_sel1: u32 = 0;
let mut next_sel0: u32 = 0;
for (word_idx, &word) in high_bits.iter().enumerate() {
if word_idx % RANK_SAMPLE_RATE == 0 {
rank_samples.push(cumul_ones);
}
let ones = word.count_ones();
let valid = if (word_idx + 1) * 64 <= high_len_bits {
64u32
} else {
(high_len_bits - word_idx * 64) as u32
};
let zeros = valid - ones;
let after_ones = cumul_ones + ones;
while next_sel1 < after_ones {
select1_samples.push((word_idx * 64) as u32);
next_sel1 += SELECT1_SAMPLE_RATE as u32;
}
let after_zeros = cumul_zeros + zeros;
while next_sel0 < after_zeros {
select0_samples.push((word_idx * 64) as u32);
next_sel0 += SELECT0_SAMPLE_RATE as u32;
}
cumul_ones = after_ones;
cumul_zeros = after_zeros;
}
rank_samples.push(cumul_ones);
(rank_samples, select1_samples, select0_samples)
}
#[inline]
pub fn len(&self) -> usize { self.len }
#[inline]
pub fn is_empty(&self) -> bool { self.len == 0 }
#[inline]
pub fn size_bytes(&self) -> usize {
self.low_bits.len() * 8
+ self.high_bits.len() * 8
+ self.rank_samples.len() * 4
+ self.select1_samples.len() * 4
+ self.select0_samples.len() * 4
+ 32 }
#[inline]
pub fn bits_per_element(&self) -> f64 {
if self.len == 0 { return 0.0; }
(self.size_bytes() * 8) as f64 / self.len as f64
}
pub fn get(&self, index: usize) -> Option<u64> {
if index >= self.len { return None; }
let low = self.get_low(index);
let high_pos = self.select1(index)?;
let high_val = high_pos.checked_sub(index)? as u64;
Some((high_val << self.low_bit_width) | low)
}
#[inline]
pub fn next_geq(&self, target: u64) -> Option<(usize, u64)> {
if self.len == 0 || target >= self.universe { return None; }
let target_high = target >> self.low_bit_width;
let bucket_start_pos = if target_high == 0 {
0
} else {
match self.select0(target_high as usize - 1) {
Some(p) => p + 1,
None => return None,
}
};
let elem_before = self.rank1(bucket_start_pos);
let mut idx = elem_before;
let mut word_idx = bucket_start_pos / 64;
if word_idx >= self.high_bits.len() { return None; }
let start_bit = bucket_start_pos % 64;
let mut word = self.high_bits[word_idx] >> start_bit;
let mut bit_pos = bucket_start_pos;
while idx < self.len {
if word == 0 {
word_idx += 1;
if word_idx >= self.high_bits.len() { return None; }
word = self.high_bits[word_idx];
bit_pos = word_idx * 64;
continue;
}
let tz = word.trailing_zeros() as usize;
bit_pos += tz;
word >>= tz;
let high_val = (bit_pos - idx) as u64;
let low = self.get_low(idx);
let val = (high_val << self.low_bit_width) | low;
if val >= target {
return Some((idx, val));
}
idx += 1;
bit_pos += 1;
word >>= 1;
}
None
}
pub fn iter(&self) -> EliasFanoIter<'_> {
EliasFanoIter { ef: self, pos: 0, high_pos: 0 }
}
#[inline]
fn get_low(&self, index: usize) -> u64 {
if self.low_bit_width == 0 { return 0; }
let bit_pos = index as u64 * self.low_bit_width as u64;
let word_idx = (bit_pos / 64) as usize;
let bit_idx = (bit_pos % 64) as u32;
let w0 = self.low_bits[word_idx];
let w1 = self.low_bits[word_idx + 1];
let combined = w0 as u128 | ((w1 as u128) << 64);
(combined >> bit_idx) as u64 & ((1u64 << self.low_bit_width) - 1)
}
#[inline]
fn select1(&self, rank: usize) -> Option<usize> {
if rank >= self.len { return None; }
let sample_idx = rank / SELECT1_SAMPLE_RATE;
let start_word = if sample_idx < self.select1_samples.len() {
self.select1_samples[sample_idx] as usize / 64
} else {
0
};
let rank_block = start_word / RANK_SAMPLE_RATE;
let mut cumul = self.rank_samples[rank_block] as usize;
for w in (rank_block * RANK_SAMPLE_RATE)..start_word {
cumul += self.high_bits[w].count_ones() as usize;
}
let mut remaining = rank - cumul;
for word_idx in start_word..self.high_bits.len() {
let word = self.high_bits[word_idx];
let ones = word.count_ones() as usize;
if remaining < ones {
return Some(word_idx * 64 + select_in_word(word, remaining));
}
remaining -= ones;
}
None
}
#[inline]
fn select0(&self, rank: usize) -> Option<usize> {
let sample_idx = rank / SELECT0_SAMPLE_RATE;
let start_word = if sample_idx < self.select0_samples.len() {
self.select0_samples[sample_idx] as usize / 64
} else {
0
};
let rank_block = start_word / RANK_SAMPLE_RATE;
let cumul_ones = {
let mut c = self.rank_samples[rank_block] as usize;
for w in (rank_block * RANK_SAMPLE_RATE)..start_word {
c += self.high_bits[w].count_ones() as usize;
}
c
};
let total_bits_before = start_word * 64;
let cumul_zeros = total_bits_before - cumul_ones;
let mut remaining = rank - cumul_zeros;
for word_idx in start_word..self.high_bits.len() {
let word = self.high_bits[word_idx];
let valid_bits = if (word_idx + 1) * 64 <= self.high_len_bits {
64
} else {
self.high_len_bits - word_idx * 64
};
let actual_zeros = valid_bits - word.count_ones() as usize;
if remaining < actual_zeros {
let inverted = !word;
return Some(word_idx * 64 + select_in_word(inverted, remaining));
}
remaining -= actual_zeros;
}
None
}
#[inline]
fn rank1(&self, pos: usize) -> usize {
let full_words = pos / 64;
let remaining_bits = pos % 64;
let sample_idx = full_words / RANK_SAMPLE_RATE;
let mut count = self.rank_samples[sample_idx] as usize;
let scan_start = sample_idx * RANK_SAMPLE_RATE;
for i in scan_start..full_words {
count += self.high_bits[i].count_ones() as usize;
}
if remaining_bits > 0 && full_words < self.high_bits.len() {
let mask = (1u64 << remaining_bits) - 1;
count += (self.high_bits[full_words] & mask).count_ones() as usize;
}
count
}
}
impl EliasFano {
#[inline]
pub fn cursor(&self) -> EliasFanoCursor<'_> {
EliasFanoCursor::new(self)
}
}
impl EliasFano {
#[inline]
pub fn batch_cursor(&self) -> EliasFanoBatchCursor<'_> {
EliasFanoBatchCursor::new(self)
}
}
pub struct EliasFanoIter<'a> {
ef: &'a EliasFano,
pos: usize,
high_pos: usize,
}
impl<'a> Iterator for EliasFanoIter<'a> {
type Item = u64;
fn next(&mut self) -> Option<Self::Item> {
if self.pos >= self.ef.len { return None; }
let next_pos = self.high_pos;
let mut word_idx = next_pos / 64;
if word_idx >= self.ef.high_bits.len() { return None; }
let bit_offset = next_pos % 64;
let mut word = self.ef.high_bits[word_idx] >> bit_offset;
if word != 0 {
let tz = word.trailing_zeros() as usize;
self.high_pos = word_idx * 64 + bit_offset + tz;
} else {
loop {
word_idx += 1;
if word_idx >= self.ef.high_bits.len() { return None; }
word = self.ef.high_bits[word_idx];
if word != 0 {
self.high_pos = word_idx * 64 + word.trailing_zeros() as usize;
break;
}
}
}
let high_val = (self.high_pos - self.pos) as u64;
let low = self.ef.get_low(self.pos);
let val = (high_val << self.ef.low_bit_width) | low;
self.pos += 1;
self.high_pos += 1;
Some(val)
}
fn size_hint(&self) -> (usize, Option<usize>) {
let remaining = self.ef.len - self.pos;
(remaining, Some(remaining))
}
}
impl<'a> ExactSizeIterator for EliasFanoIter<'a> {}
pub struct EliasFanoCursor<'a> {
ef: &'a EliasFano,
index: usize,
high_pos: usize,
}
impl<'a> EliasFanoCursor<'a> {
fn new(ef: &'a EliasFano) -> Self {
if ef.is_empty() {
return Self { ef, index: 0, high_pos: 0 };
}
let mut high_pos = 0;
for (word_idx, &word) in ef.high_bits.iter().enumerate() {
if word != 0 {
high_pos = word_idx * 64 + word.trailing_zeros() as usize;
break;
}
}
Self { ef, index: 0, high_pos }
}
#[inline]
pub fn current(&self) -> Option<u64> {
if self.index >= self.ef.len { return None; }
let high_val = (self.high_pos - self.index) as u64;
let low = self.ef.get_low(self.index);
Some((high_val << self.ef.low_bit_width) | low)
}
#[inline]
pub fn index(&self) -> usize { self.index }
#[inline]
pub fn is_exhausted(&self) -> bool { self.index >= self.ef.len }
#[inline]
pub fn advance(&mut self) -> bool {
self.index += 1;
if self.index >= self.ef.len { return false; }
let next_pos = self.high_pos + 1;
let mut word_idx = next_pos / 64;
if word_idx >= self.ef.high_bits.len() { return false; }
let bit_in_word = next_pos % 64;
let mut word = self.ef.high_bits[word_idx] >> bit_in_word;
if word != 0 {
self.high_pos = word_idx * 64 + bit_in_word + word.trailing_zeros() as usize;
return true;
}
loop {
word_idx += 1;
if word_idx >= self.ef.high_bits.len() { return false; }
word = self.ef.high_bits[word_idx];
if word != 0 {
self.high_pos = word_idx * 64 + word.trailing_zeros() as usize;
return true;
}
}
}
#[inline]
pub fn advance_to_geq(&mut self, target: u64) -> bool {
if let Some(val) = self.current() {
if val >= target { return true; }
} else {
return false;
}
if target >= self.ef.universe { self.index = self.ef.len; return false; }
let target_high = target >> self.ef.low_bit_width;
let bucket_start_pos = if target_high == 0 {
0
} else {
match self.ef.select0(target_high as usize - 1) {
Some(p) => p + 1,
None => { self.index = self.ef.len; return false; }
}
};
let elem_before = self.ef.rank1(bucket_start_pos);
let mut idx = elem_before;
let mut word_idx = bucket_start_pos / 64;
if word_idx >= self.ef.high_bits.len() { self.index = self.ef.len; return false; }
let start_bit = bucket_start_pos % 64;
let mut word = self.ef.high_bits[word_idx] >> start_bit;
let mut bit_pos = bucket_start_pos;
while idx < self.ef.len {
if word == 0 {
word_idx += 1;
if word_idx >= self.ef.high_bits.len() { break; }
word = self.ef.high_bits[word_idx];
bit_pos = word_idx * 64;
continue;
}
let tz = word.trailing_zeros() as usize;
bit_pos += tz;
word >>= tz;
let high_val = (bit_pos - idx) as u64;
let low = self.ef.get_low(idx);
let val = (high_val << self.ef.low_bit_width) | low;
if val >= target {
self.index = idx;
self.high_pos = bit_pos;
return true;
}
idx += 1;
bit_pos += 1;
word >>= 1;
}
self.index = self.ef.len;
false
}
#[inline]
pub fn advance_to_index(&mut self, idx: usize) -> bool {
if idx >= self.ef.len { return false; }
if idx == self.index { return true; }
if let Some(pos) = self.ef.select1(idx) {
self.index = idx;
self.high_pos = pos;
true
} else {
false
}
}
pub fn reset(&mut self) {
*self = Self::new(self.ef);
}
}
pub struct EliasFanoBatchCursor<'a> {
ef: &'a EliasFano,
buffer: [u64; BATCH_SIZE],
buf_pos: usize,
buf_len: usize,
next_elem: usize,
high_pos: usize,
}
impl<'a> EliasFanoBatchCursor<'a> {
fn new(ef: &'a EliasFano) -> Self {
let mut cursor = Self {
ef,
buffer: [0; BATCH_SIZE],
buf_pos: 0,
buf_len: 0,
next_elem: 0,
high_pos: 0,
};
if !ef.is_empty() {
cursor.refill();
}
cursor
}
#[inline]
pub fn current(&self) -> Option<u64> {
if self.buf_pos < self.buf_len {
Some(self.buffer[self.buf_pos])
} else {
None
}
}
#[inline]
pub fn index(&self) -> usize {
self.next_elem - self.buf_len + self.buf_pos
}
#[inline]
pub fn is_exhausted(&self) -> bool {
self.buf_pos >= self.buf_len && self.next_elem >= self.ef.len
}
#[inline]
pub fn advance(&mut self) -> bool {
self.buf_pos += 1;
if self.buf_pos < self.buf_len {
return true;
}
if self.next_elem >= self.ef.len {
return false;
}
self.refill();
self.buf_len > 0
}
fn refill(&mut self) {
let count = (self.ef.len - self.next_elem).min(BATCH_SIZE);
if count == 0 {
self.buf_len = 0;
return;
}
let start_elem = self.next_elem;
let lbw = self.ef.low_bit_width;
let mut bit_pos = self.high_pos;
let mut word_idx = bit_pos / 64;
let start_bit = bit_pos % 64;
let mut word = if word_idx < self.ef.high_bits.len() {
self.ef.high_bits[word_idx] >> start_bit
} else {
0
};
bit_pos = word_idx * 64 + start_bit;
let low_mask = if lbw == 0 { 0u64 } else { (1u64 << lbw) - 1 };
let mut low_bit_pos = start_elem as u64 * lbw as u64;
let mut low_word_idx = (low_bit_pos / 64) as usize;
let mut low_bit_idx = (low_bit_pos % 64) as u32;
let mut low_word = if lbw > 0 && low_word_idx < self.ef.low_bits.len() {
self.ef.low_bits[low_word_idx]
} else { 0 };
let mut low_word_next = if lbw > 0 && low_word_idx + 1 < self.ef.low_bits.len() {
self.ef.low_bits[low_word_idx + 1]
} else { 0 };
for k in 0..count {
let idx = start_elem + k;
while word == 0 {
word_idx += 1;
word = self.ef.high_bits[word_idx];
bit_pos = word_idx * 64;
}
let tz = word.trailing_zeros() as usize;
bit_pos += tz;
word >>= tz;
let high_val = (bit_pos - idx) as u64;
let low = if lbw == 0 {
0u64
} else {
let mut val = low_word >> low_bit_idx;
if low_bit_idx + lbw > 64 {
val |= low_word_next << (64 - low_bit_idx);
}
val & low_mask
};
self.buffer[k] = (high_val << lbw) | low;
bit_pos += 1;
word >>= 1;
if lbw > 0 {
low_bit_idx += lbw;
if low_bit_idx >= 64 {
low_bit_idx -= 64;
low_word_idx += 1;
low_word = low_word_next;
low_word_next = if low_word_idx + 1 < self.ef.low_bits.len() {
self.ef.low_bits[low_word_idx + 1]
} else { 0 };
}
}
}
self.buf_pos = 0;
self.buf_len = count;
self.next_elem = start_elem + count;
self.high_pos = bit_pos;
}
pub fn reset(&mut self) {
*self = Self::new(self.ef);
}
}