#[cfg(not(test))]
use alloc::vec;
#[cfg(not(test))]
use alloc::vec::Vec;
use core::cell::Cell;
use crate::util::broadword::select_in_word;
pub(super) const SELECT_SAMPLE_RATE: usize = 256;
#[derive(Clone, Debug)]
pub enum OpenPositions {
Compact(AdvancePositions),
Dense(Vec<u32>),
}
impl OpenPositions {
pub fn build(positions: &[u32], text_len: usize) -> Self {
let is_monotonic = positions.windows(2).all(|w| w[0] <= w[1]);
if is_monotonic {
OpenPositions::Compact(AdvancePositions::build_unchecked(positions, text_len))
} else {
OpenPositions::Dense(positions.to_vec())
}
}
#[inline]
pub fn len(&self) -> usize {
match self {
OpenPositions::Compact(ap) => ap.len(),
OpenPositions::Dense(v) => v.len(),
}
}
#[inline]
pub fn is_empty(&self) -> bool {
match self {
OpenPositions::Compact(ap) => ap.is_empty(),
OpenPositions::Dense(v) => v.is_empty(),
}
}
#[inline]
pub fn get(&self, open_idx: usize) -> Option<u32> {
match self {
OpenPositions::Compact(ap) => ap.get(open_idx),
OpenPositions::Dense(v) => v.get(open_idx).copied(),
}
}
pub fn find_last_open_at_text_pos(&self, text_pos: usize) -> Option<usize> {
match self {
OpenPositions::Compact(ap) => ap.find_last_open_at_text_pos(text_pos),
OpenPositions::Dense(v) => {
let text_pos_u32 = text_pos as u32;
let search_result = v.binary_search(&text_pos_u32);
match search_result {
Ok(idx) => {
let mut last = idx;
while last + 1 < v.len() && v[last + 1] == text_pos_u32 {
last += 1;
}
Some(last)
}
Err(_) => None,
}
}
}
}
pub fn heap_size(&self) -> usize {
match self {
OpenPositions::Compact(ap) => ap.heap_size(),
OpenPositions::Dense(v) => v.len() * 4,
}
}
#[inline]
pub fn is_compact(&self) -> bool {
matches!(self, OpenPositions::Compact(_))
}
}
#[derive(Clone, Copy, Debug)]
struct SequentialCursor {
next_open_idx: usize,
adv_cumulative: usize,
ib_word_idx: usize,
ib_ones_before: usize,
last_ib_arg: usize,
last_ib_result: usize,
}
impl Default for SequentialCursor {
fn default() -> Self {
Self {
next_open_idx: 0,
adv_cumulative: 0,
ib_word_idx: 0,
ib_ones_before: 0,
last_ib_arg: usize::MAX,
last_ib_result: 0,
}
}
}
#[derive(Clone, Debug)]
pub struct AdvancePositions {
ib_words: Vec<u64>,
ib_len: usize,
ib_rank: Vec<u32>,
ib_select_samples: Vec<u32>,
ib_ones: usize,
advance_words: Vec<u64>,
num_opens: usize,
advance_rank: Vec<u32>,
cursor: Cell<SequentialCursor>,
}
#[derive(Clone, Debug)]
pub struct AdvancePositionsCursor<'a> {
positions: &'a AdvancePositions,
open_idx: usize,
text_pos: usize,
advance_rank: usize,
ib_word_idx: usize,
ib_remaining_bits: u64,
}
impl AdvancePositions {
pub fn build_unchecked(positions: &[u32], text_len: usize) -> Self {
debug_assert!(
positions.windows(2).all(|w| w[0] <= w[1]),
"AdvancePositions::build_unchecked requires monotonically non-decreasing positions"
);
if positions.is_empty() {
return Self {
ib_words: Vec::new(),
ib_len: text_len,
ib_rank: vec![0],
ib_select_samples: Vec::new(),
ib_ones: 0,
advance_words: Vec::new(),
num_opens: 0,
advance_rank: vec![0],
cursor: Cell::default(),
};
}
let num_opens = positions.len();
let ib_num_words = text_len.div_ceil(64);
let mut ib_words = vec![0u64; ib_num_words];
let advance_num_words = num_opens.div_ceil(64);
let mut advance_words = vec![0u64; advance_num_words];
let mut prev_pos: Option<u32> = None;
let mut ib_ones = 0usize;
for (i, &pos) in positions.iter().enumerate() {
let is_new_position = prev_pos != Some(pos);
if is_new_position {
let word_idx = pos as usize / 64;
let bit_idx = pos as usize % 64;
if word_idx < ib_words.len() {
if (ib_words[word_idx] >> bit_idx) & 1 == 0 {
ib_words[word_idx] |= 1u64 << bit_idx;
ib_ones += 1;
}
}
let adv_word_idx = i / 64;
let adv_bit_idx = i % 64;
advance_words[adv_word_idx] |= 1u64 << adv_bit_idx;
}
prev_pos = Some(pos);
}
let ib_rank = build_cumulative_rank(&ib_words);
let advance_rank = build_cumulative_rank(&advance_words);
let ib_select_samples = build_select_samples(&ib_words, ib_ones);
Self {
ib_words,
ib_len: text_len,
ib_rank,
ib_select_samples,
ib_ones,
advance_words,
num_opens,
advance_rank,
cursor: Cell::default(),
}
}
#[inline]
pub fn len(&self) -> usize {
self.num_opens
}
#[inline]
pub fn is_empty(&self) -> bool {
self.num_opens == 0
}
#[inline(always)]
pub fn get(&self, open_idx: usize) -> Option<u32> {
let cursor = self.cursor.get();
if open_idx == cursor.next_open_idx {
self.get_sequential(open_idx, cursor)
} else if open_idx > cursor.next_open_idx {
let mut cursor = cursor;
self.advance_cursor_to(&mut cursor, open_idx);
self.get_sequential(open_idx, cursor)
} else {
self.get_random(open_idx)
}
}
#[inline]
fn get_sequential(&self, open_idx: usize, mut cursor: SequentialCursor) -> Option<u32> {
if open_idx >= self.num_opens {
return None;
}
let word_idx = open_idx / 64;
let bit_idx = open_idx % 64;
let adv_bit = if word_idx < self.advance_words.len() {
(self.advance_words[word_idx] >> bit_idx) & 1
} else {
0
};
let advance_count = cursor.adv_cumulative + adv_bit as usize;
cursor.adv_cumulative = advance_count;
cursor.next_open_idx = open_idx + 1;
if advance_count == 0 {
self.cursor.set(cursor);
return None;
}
let k = advance_count - 1;
if k == cursor.last_ib_arg {
self.cursor.set(cursor);
return Some(cursor.last_ib_result as u32);
}
let mut remaining = k - cursor.ib_ones_before;
let mut wi = cursor.ib_word_idx;
while wi < self.ib_words.len() {
let word = self.ib_words[wi];
let ones = word.count_ones() as usize;
if remaining < ones {
let bit_pos = select_in_word(word, remaining as u32) as usize;
let result = wi * 64 + bit_pos;
cursor.ib_ones_before = k - remaining;
cursor.ib_word_idx = wi;
cursor.last_ib_arg = k;
cursor.last_ib_result = result;
self.cursor.set(cursor);
return Some(result as u32);
}
remaining -= ones;
wi += 1;
}
self.cursor.set(cursor);
None
}
#[inline]
fn advance_cursor_to(&self, cursor: &mut SequentialCursor, target: usize) {
cursor.adv_cumulative = self.advance_rank1(target);
cursor.next_open_idx = target;
}
fn get_random(&self, open_idx: usize) -> Option<u32> {
if open_idx >= self.num_opens {
return None;
}
let advance_count = self.advance_rank1(open_idx + 1);
let result = if advance_count == 0 {
None
} else {
self.ib_select1(advance_count - 1).map(|pos| pos as u32)
};
self.cursor.set(SequentialCursor {
next_open_idx: open_idx + 1,
adv_cumulative: advance_count,
ib_word_idx: 0,
ib_ones_before: 0,
last_ib_arg: usize::MAX,
last_ib_result: 0,
});
result
}
#[inline]
pub fn cursor(&self) -> AdvancePositionsCursor<'_> {
if self.is_empty() {
return AdvancePositionsCursor {
positions: self,
open_idx: 0,
text_pos: 0,
advance_rank: 0,
ib_word_idx: 0,
ib_remaining_bits: 0,
};
}
let text_pos = self.ib_select1(0).unwrap_or(0);
let ib_word_idx = text_pos / 64;
let ib_bit_offset = text_pos % 64;
let ib_remaining_bits = if ib_word_idx < self.ib_words.len() {
self.ib_words[ib_word_idx] & !((1u64 << ib_bit_offset) - 1)
} else {
0
};
AdvancePositionsCursor {
positions: self,
open_idx: 0,
text_pos,
advance_rank: 1, ib_word_idx,
ib_remaining_bits,
}
}
#[inline]
pub fn cursor_from(&self, open_idx: usize) -> AdvancePositionsCursor<'_> {
if open_idx >= self.num_opens {
return AdvancePositionsCursor {
positions: self,
open_idx: self.num_opens,
text_pos: 0,
advance_rank: self.ib_ones,
ib_word_idx: self.ib_words.len(),
ib_remaining_bits: 0,
};
}
if open_idx == 0 {
return self.cursor();
}
let advance_rank = self.advance_rank1(open_idx + 1);
let text_pos = if advance_rank > 0 {
self.ib_select1(advance_rank - 1).unwrap_or(0)
} else {
0
};
let ib_word_idx = text_pos / 64;
let ib_bit_offset = text_pos % 64;
let ib_remaining_bits = if ib_word_idx < self.ib_words.len() {
self.ib_words[ib_word_idx] & !((1u64 << ib_bit_offset) - 1)
} else {
0
};
AdvancePositionsCursor {
positions: self,
open_idx,
text_pos,
advance_rank,
ib_word_idx,
ib_remaining_bits,
}
}
#[inline]
fn advance_rank1(&self, pos: usize) -> usize {
if pos == 0 {
return 0;
}
let word_idx = pos / 64;
let bit_idx = pos % 64;
let mut count = self.advance_rank[word_idx.min(self.advance_words.len())] as usize;
if word_idx < self.advance_words.len() && bit_idx > 0 {
let mask = (1u64 << bit_idx) - 1;
count += (self.advance_words[word_idx] & mask).count_ones() as usize;
}
count
}
#[inline]
fn advance_bit(&self, pos: usize) -> bool {
if pos >= self.num_opens {
return false;
}
let word_idx = pos / 64;
let bit_idx = pos % 64;
(self.advance_words[word_idx] >> bit_idx) & 1 == 1
}
#[inline]
fn ib_select1(&self, k: usize) -> Option<usize> {
if k >= self.ib_ones {
return None;
}
let sample_idx = k / SELECT_SAMPLE_RATE;
let (start_word, mut remaining) = if sample_idx < self.ib_select_samples.len() {
let sample_pos = self.ib_select_samples[sample_idx] as usize;
let skip_ones = sample_idx * SELECT_SAMPLE_RATE;
(sample_pos / 64, k - skip_ones)
} else {
(0, k)
};
for word_idx in start_word..self.ib_words.len() {
let word = if word_idx == start_word && sample_idx < self.ib_select_samples.len() {
let sample_pos = self.ib_select_samples[sample_idx] as usize;
let bit_offset = sample_pos % 64;
self.ib_words[word_idx] & !((1u64 << bit_offset) - 1)
} else {
self.ib_words[word_idx]
};
let ones = word.count_ones() as usize;
if ones > remaining {
let bit_pos = select_in_word(word, remaining as u32) as usize;
return Some(word_idx * 64 + bit_pos);
}
remaining -= ones;
}
None
}
pub fn heap_size(&self) -> usize {
self.ib_words.len() * 8
+ self.ib_rank.len() * 4
+ self.ib_select_samples.len() * 4
+ self.advance_words.len() * 8
+ self.advance_rank.len() * 4
}
pub fn find_last_open_at_text_pos(&self, text_pos: usize) -> Option<usize> {
if self.num_opens == 0 || text_pos >= self.ib_len {
return None;
}
let word_idx = text_pos / 64;
let bit_idx = text_pos % 64;
if word_idx >= self.ib_words.len() {
return None;
}
if (self.ib_words[word_idx] >> bit_idx) & 1 == 0 {
return None; }
let ib_rank = self.ib_rank1(text_pos);
let first_open = self.advance_select1(ib_rank)?;
let mut last_open = first_open;
while last_open + 1 < self.num_opens && !self.advance_bit(last_open + 1) {
last_open += 1;
}
Some(last_open)
}
#[inline]
fn ib_rank1(&self, pos: usize) -> usize {
if pos == 0 {
return 0;
}
let word_idx = pos / 64;
let bit_idx = pos % 64;
let mut count = self.ib_rank[word_idx.min(self.ib_words.len())] as usize;
if word_idx < self.ib_words.len() && bit_idx > 0 {
let mask = (1u64 << bit_idx) - 1;
count += (self.ib_words[word_idx] & mask).count_ones() as usize;
}
count
}
#[inline]
fn advance_select1(&self, k: usize) -> Option<usize> {
let total_advances = *self.advance_rank.last().unwrap_or(&0) as usize;
if k >= total_advances {
return None;
}
let mut remaining = k;
for (word_idx, &word) in self.advance_words.iter().enumerate() {
let ones = word.count_ones() as usize;
if ones > remaining {
let bit_pos = select_in_word(word, remaining as u32) as usize;
return Some(word_idx * 64 + bit_pos);
}
remaining -= ones;
}
None
}
}
impl<'a> AdvancePositionsCursor<'a> {
#[inline]
pub fn current(&self) -> Option<u32> {
if self.open_idx >= self.positions.num_opens {
return None;
}
Some(self.text_pos as u32)
}
#[inline]
pub fn index(&self) -> usize {
self.open_idx
}
#[inline]
pub fn is_exhausted(&self) -> bool {
self.open_idx >= self.positions.num_opens
}
#[inline]
pub fn advance_one(&mut self) -> Option<u32> {
if self.open_idx + 1 >= self.positions.num_opens {
self.open_idx = self.positions.num_opens;
return None;
}
self.open_idx += 1;
if !self.positions.advance_bit(self.open_idx) {
return Some(self.text_pos as u32);
}
self.advance_rank += 1;
self.advance_ib_cursor();
Some(self.text_pos as u32)
}
#[inline]
fn advance_ib_cursor(&mut self) {
self.ib_remaining_bits &= self.ib_remaining_bits.wrapping_sub(1);
if self.ib_remaining_bits != 0 {
let bit_pos = self.ib_remaining_bits.trailing_zeros() as usize;
self.text_pos = self.ib_word_idx * 64 + bit_pos;
} else {
self.ib_word_idx += 1;
while self.ib_word_idx < self.positions.ib_words.len()
&& self.positions.ib_words[self.ib_word_idx] == 0
{
self.ib_word_idx += 1;
}
if self.ib_word_idx < self.positions.ib_words.len() {
self.ib_remaining_bits = self.positions.ib_words[self.ib_word_idx];
let bit_pos = self.ib_remaining_bits.trailing_zeros() as usize;
self.text_pos = self.ib_word_idx * 64 + bit_pos;
}
}
}
}
pub(super) fn build_cumulative_rank(words: &[u64]) -> Vec<u32> {
let mut rank = Vec::with_capacity(words.len() + 1);
let mut cumulative: u32 = 0;
rank.push(0);
for &word in words {
cumulative += word.count_ones();
rank.push(cumulative);
}
rank
}
pub(super) fn build_select_samples(words: &[u64], total_ones: usize) -> Vec<u32> {
if total_ones == 0 {
return Vec::new();
}
let num_samples = total_ones.div_ceil(SELECT_SAMPLE_RATE);
let mut samples = Vec::with_capacity(num_samples);
let mut ones_seen = 0usize;
let mut sample_target = 0usize;
'outer: for (word_idx, &word) in words.iter().enumerate() {
if word == 0 {
continue;
}
let word_ones = word.count_ones() as usize;
while sample_target < ones_seen + word_ones {
let local_rank = sample_target - ones_seen;
let bit_pos = select_in_word(word, local_rank as u32) as usize;
let global_pos = word_idx * 64 + bit_pos;
samples.push(global_pos as u32);
sample_target += SELECT_SAMPLE_RATE;
if samples.len() >= num_samples {
break 'outer;
}
}
ones_seen += word_ones;
}
samples
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty() {
let ap = AdvancePositions::build_unchecked(&[], 0);
assert!(ap.is_empty());
assert_eq!(ap.len(), 0);
assert_eq!(ap.get(0), None);
let cursor = ap.cursor();
assert!(cursor.is_exhausted());
assert_eq!(cursor.current(), None);
}
#[test]
fn test_single_position() {
let ap = AdvancePositions::build_unchecked(&[42], 100);
assert_eq!(ap.len(), 1);
assert_eq!(ap.get(0), Some(42));
assert_eq!(ap.get(1), None);
let mut cursor = ap.cursor();
assert_eq!(cursor.current(), Some(42));
assert_eq!(cursor.advance_one(), None);
assert!(cursor.is_exhausted());
}
#[test]
fn test_no_duplicates() {
let positions = vec![0, 10, 20, 30, 40];
let ap = AdvancePositions::build_unchecked(&positions, 100);
assert_eq!(ap.len(), 5);
for (i, &expected) in positions.iter().enumerate() {
assert_eq!(ap.get(i), Some(expected), "get({}) failed", i);
}
let mut cursor = ap.cursor();
for &expected in &positions {
assert_eq!(cursor.current(), Some(expected));
cursor.advance_one();
}
assert!(cursor.is_exhausted());
}
#[test]
fn test_with_duplicates() {
let positions = vec![0, 0, 10, 10, 10, 20];
let ap = AdvancePositions::build_unchecked(&positions, 100);
assert_eq!(ap.len(), 6);
for (i, &expected) in positions.iter().enumerate() {
assert_eq!(ap.get(i), Some(expected), "get({}) failed", i);
}
let mut cursor = ap.cursor();
for &expected in &positions {
assert_eq!(cursor.current(), Some(expected));
cursor.advance_one();
}
assert!(cursor.is_exhausted());
}
#[test]
fn test_all_duplicates() {
let positions = vec![5, 5, 5, 5, 5];
let ap = AdvancePositions::build_unchecked(&positions, 100);
assert_eq!(ap.len(), 5);
for i in 0..5 {
assert_eq!(ap.get(i), Some(5));
}
let mut cursor = ap.cursor();
for _ in 0..5 {
assert_eq!(cursor.current(), Some(5));
cursor.advance_one();
}
assert!(cursor.is_exhausted());
}
#[test]
fn test_cursor_from() {
let positions = vec![0, 0, 10, 20, 20, 30];
let ap = AdvancePositions::build_unchecked(&positions, 100);
for (i, &expected) in positions.iter().enumerate() {
let cursor = ap.cursor_from(i);
assert_eq!(cursor.index(), i);
assert_eq!(
cursor.current(),
Some(expected),
"cursor_from({}) failed",
i
);
}
let cursor = ap.cursor_from(10);
assert!(cursor.is_exhausted());
}
#[test]
fn test_cursor_from_then_iterate() {
let positions: Vec<u32> = (0..100).map(|i| i * 10).collect();
let ap = AdvancePositions::build_unchecked(&positions, 1000);
let mut cursor = ap.cursor_from(50);
let mut collected = Vec::new();
while let Some(v) = cursor.current() {
collected.push(v);
cursor.advance_one();
}
assert_eq!(collected, positions[50..].to_vec());
}
#[test]
fn test_memory_savings() {
let mut positions = Vec::with_capacity(1000);
let mut pos = 0u32;
for i in 0..1000 {
positions.push(pos);
if i % 3 != 0 {
pos += 10 + (i as u32 % 20);
}
}
let ap = AdvancePositions::build_unchecked(&positions, pos as usize + 100);
let vec_size = positions.len() * 4;
let ap_size = ap.heap_size();
eprintln!(
"Memory: Vec<u32>={} bytes, AdvancePositions={} bytes ({:.2}x compression)",
vec_size,
ap_size,
vec_size as f64 / ap_size as f64
);
assert!(
ap_size < vec_size,
"AdvancePositions {} should be smaller than Vec<u32> {}",
ap_size,
vec_size
);
}
#[test]
fn test_large_positions() {
let positions = vec![0, 100_000, 200_000, 300_000];
let ap = AdvancePositions::build_unchecked(&positions, 400_000);
for (i, &expected) in positions.iter().enumerate() {
assert_eq!(ap.get(i), Some(expected));
}
}
#[test]
fn test_many_elements_with_samples() {
let positions: Vec<u32> = (0..1000).map(|i| i * 5).collect();
let ap = AdvancePositions::build_unchecked(&positions, 5000);
assert_eq!(ap.get(0), Some(0));
assert_eq!(ap.get(256), Some(1280)); assert_eq!(ap.get(512), Some(2560)); assert_eq!(ap.get(999), Some(4995));
let mut cursor = ap.cursor();
for &expected in &positions {
assert_eq!(cursor.current(), Some(expected));
cursor.advance_one();
}
}
#[test]
fn test_find_last_open_at_text_pos() {
let positions = vec![0, 0, 10, 10, 10, 20];
let ap = AdvancePositions::build_unchecked(&positions, 100);
assert_eq!(ap.find_last_open_at_text_pos(0), Some(1));
assert_eq!(ap.find_last_open_at_text_pos(10), Some(4));
assert_eq!(ap.find_last_open_at_text_pos(20), Some(5));
assert_eq!(ap.find_last_open_at_text_pos(5), None);
assert_eq!(ap.find_last_open_at_text_pos(100), None);
}
#[test]
fn test_find_last_open_at_text_pos_no_duplicates() {
let positions = vec![0, 10, 20, 30];
let ap = AdvancePositions::build_unchecked(&positions, 100);
assert_eq!(ap.find_last_open_at_text_pos(0), Some(0));
assert_eq!(ap.find_last_open_at_text_pos(10), Some(1));
assert_eq!(ap.find_last_open_at_text_pos(20), Some(2));
assert_eq!(ap.find_last_open_at_text_pos(30), Some(3));
assert_eq!(ap.find_last_open_at_text_pos(15), None);
}
#[test]
fn test_find_last_open_at_text_pos_all_duplicates() {
let positions = vec![5, 5, 5, 5, 5];
let ap = AdvancePositions::build_unchecked(&positions, 100);
assert_eq!(ap.find_last_open_at_text_pos(5), Some(4));
assert_eq!(ap.find_last_open_at_text_pos(0), None);
}
}