#[cfg(not(test))]
use alloc::boxed::Box;
#[cfg(not(test))]
use alloc::vec;
#[cfg(not(test))]
use alloc::vec::Vec;
use core::cell::Cell;
use crate::util::broadword::select_in_word;
use super::advance_positions::{build_cumulative_rank, build_select_samples, SELECT_SAMPLE_RATE};
#[derive(Clone, Copy, Debug)]
struct SequentialCursor {
next_open_idx: usize,
adv_cumulative: usize,
ib_word_idx: usize,
ib_ones_before: usize,
last_ib_arg: usize,
last_ib_result: usize,
}
impl Default for SequentialCursor {
fn default() -> Self {
Self {
next_open_idx: 0,
adv_cumulative: 0,
ib_word_idx: 0,
ib_ones_before: 0,
last_ib_arg: usize::MAX,
last_ib_result: 0,
}
}
}
#[derive(Clone, Debug)]
pub enum EndPositions {
Compact(Box<CompactEndPositions>),
Dense(Vec<u32>),
}
#[derive(Clone, Debug)]
pub struct CompactEndPositions {
ib_words: Vec<u64>,
#[allow(dead_code)]
ib_len: usize,
ib_select_samples: Vec<u32>,
ib_ones: usize,
advance_words: Vec<u64>,
num_opens: usize,
advance_rank: Vec<u32>,
cursor: Cell<SequentialCursor>,
}
impl EndPositions {
pub fn build(positions: &[u32], text_len: usize) -> Self {
if positions.is_empty() {
return EndPositions::Compact(Box::new(CompactEndPositions::empty(text_len)));
}
match CompactEndPositions::try_build(positions, text_len) {
Some(compact) => EndPositions::Compact(Box::new(compact)),
None => EndPositions::Dense(positions.to_vec()),
}
}
#[inline]
pub fn get(&self, open_idx: usize) -> Option<usize> {
match self {
EndPositions::Compact(c) => c.get(open_idx),
EndPositions::Dense(v) => v
.get(open_idx)
.map(|&pos| pos as usize)
.filter(|&pos| pos > 0),
}
}
#[cfg(test)]
pub fn heap_size(&self) -> usize {
match self {
EndPositions::Compact(c) => c.heap_size(),
EndPositions::Dense(v) => v.len() * 4,
}
}
#[cfg(test)]
#[inline]
pub fn is_compact(&self) -> bool {
matches!(self, EndPositions::Compact(_))
}
}
impl CompactEndPositions {
fn empty(text_len: usize) -> Self {
Self {
ib_words: Vec::new(),
ib_len: text_len,
ib_select_samples: Vec::new(),
ib_ones: 0,
advance_words: Vec::new(),
num_opens: 0,
advance_rank: vec![0],
cursor: Cell::default(),
}
}
fn try_build(positions: &[u32], text_len: usize) -> Option<Self> {
let num_opens = positions.len();
let ib_num_words = (text_len + 1).div_ceil(64);
let mut ib_words = vec![0u64; ib_num_words];
let advance_num_words = num_opens.div_ceil(64);
let mut advance_words = vec![0u64; advance_num_words];
let mut prev_effective: Option<u32> = None;
let mut prev_nonzero = 0u32;
let mut ib_ones = 0usize;
for (i, &pos) in positions.iter().enumerate() {
let effective = if pos > 0 {
if prev_nonzero > 0 && pos < prev_nonzero {
return None;
}
prev_nonzero = pos;
pos
} else {
prev_nonzero
};
if effective == 0 {
prev_effective = Some(0);
continue;
}
let is_new_position = prev_effective != Some(effective);
if is_new_position {
let word_idx = effective as usize / 64;
let bit_idx = effective as usize % 64;
if word_idx < ib_words.len() && (ib_words[word_idx] >> bit_idx) & 1 == 0 {
ib_words[word_idx] |= 1u64 << bit_idx;
ib_ones += 1;
}
advance_words[i / 64] |= 1u64 << (i % 64);
}
prev_effective = Some(effective);
}
if ib_ones == 0 {
return Some(Self::empty(text_len));
}
let advance_rank = build_cumulative_rank(&advance_words);
let ib_select_samples = build_select_samples(&ib_words, ib_ones);
Some(Self {
ib_words,
ib_len: text_len,
ib_select_samples,
ib_ones,
advance_words,
num_opens,
advance_rank,
cursor: Cell::default(),
})
}
#[inline(always)]
pub fn get(&self, open_idx: usize) -> Option<usize> {
let cursor = self.cursor.get();
if open_idx == cursor.next_open_idx {
self.get_sequential(open_idx, cursor)
} else if open_idx > cursor.next_open_idx {
let mut cursor = cursor;
self.advance_cursor_to(&mut cursor, open_idx);
self.get_sequential(open_idx, cursor)
} else {
self.get_random(open_idx)
}
}
#[inline]
fn advance_cursor_to(&self, cursor: &mut SequentialCursor, target: usize) {
cursor.adv_cumulative = self.advance_rank1(target);
cursor.next_open_idx = target;
}
#[inline]
fn get_sequential(&self, open_idx: usize, mut cursor: SequentialCursor) -> Option<usize> {
if open_idx >= self.num_opens {
return None;
}
let word_idx = open_idx / 64;
let bit_idx = open_idx % 64;
let adv_bit = if word_idx < self.advance_words.len() {
(self.advance_words[word_idx] >> bit_idx) & 1
} else {
0
};
let advance_count = cursor.adv_cumulative + adv_bit as usize;
cursor.adv_cumulative = advance_count;
cursor.next_open_idx = open_idx + 1;
if advance_count == 0 {
self.cursor.set(cursor);
return None;
}
let k = advance_count - 1;
if k == cursor.last_ib_arg {
self.cursor.set(cursor);
return Some(cursor.last_ib_result);
}
let mut remaining = k - cursor.ib_ones_before;
let mut wi = cursor.ib_word_idx;
while wi < self.ib_words.len() {
let word = self.ib_words[wi];
let ones = word.count_ones() as usize;
if remaining < ones {
let bit_pos = select_in_word(word, remaining as u32) as usize;
let result = wi * 64 + bit_pos;
cursor.ib_ones_before = k - remaining; cursor.ib_word_idx = wi;
cursor.last_ib_arg = k;
cursor.last_ib_result = result;
self.cursor.set(cursor);
return Some(result);
}
remaining -= ones;
wi += 1;
}
self.cursor.set(cursor);
None
}
fn get_random(&self, open_idx: usize) -> Option<usize> {
if open_idx >= self.num_opens {
return None;
}
let advance_count = self.advance_rank1(open_idx + 1);
let result = if advance_count == 0 {
None
} else {
self.ib_select1(advance_count - 1)
};
self.cursor.set(SequentialCursor {
next_open_idx: open_idx + 1,
adv_cumulative: advance_count,
ib_word_idx: 0,
ib_ones_before: 0,
last_ib_arg: usize::MAX,
last_ib_result: 0,
});
result
}
#[inline]
fn advance_rank1(&self, pos: usize) -> usize {
if pos == 0 {
return 0;
}
let word_idx = pos / 64;
let bit_idx = pos % 64;
let mut count = self.advance_rank[word_idx.min(self.advance_words.len())] as usize;
if word_idx < self.advance_words.len() && bit_idx > 0 {
let mask = (1u64 << bit_idx) - 1;
count += (self.advance_words[word_idx] & mask).count_ones() as usize;
}
count
}
#[inline]
fn ib_select1(&self, k: usize) -> Option<usize> {
if k >= self.ib_ones {
return None;
}
let sample_idx = k / SELECT_SAMPLE_RATE;
let (start_word, mut remaining) = if sample_idx < self.ib_select_samples.len() {
let sample_pos = self.ib_select_samples[sample_idx] as usize;
let skip_ones = sample_idx * SELECT_SAMPLE_RATE;
(sample_pos / 64, k - skip_ones)
} else {
(0, k)
};
for word_idx in start_word..self.ib_words.len() {
let word = if word_idx == start_word && sample_idx < self.ib_select_samples.len() {
let sample_pos = self.ib_select_samples[sample_idx] as usize;
let bit_offset = sample_pos % 64;
self.ib_words[word_idx] & !((1u64 << bit_offset) - 1)
} else {
self.ib_words[word_idx]
};
let ones = word.count_ones() as usize;
if ones > remaining {
let bit_pos = select_in_word(word, remaining as u32) as usize;
return Some(word_idx * 64 + bit_pos);
}
remaining -= ones;
}
None
}
#[cfg(test)]
pub fn heap_size(&self) -> usize {
self.ib_words.len() * 8
+ self.ib_select_samples.len() * 4
+ self.advance_words.len() * 8
+ self.advance_rank.len() * 4
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty() {
let ep = EndPositions::build(&[], 0);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None);
}
#[test]
fn test_all_zeros() {
let positions = vec![0, 0, 0, 0, 0];
let ep = EndPositions::build(&positions, 100);
assert!(ep.is_compact());
for i in 0..5 {
assert_eq!(ep.get(i), None, "get({}) should be None", i);
}
}
#[test]
fn test_simple_scalars() {
let positions = vec![0, 10, 20, 0, 30];
let ep = EndPositions::build(&positions, 100);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(10)); assert_eq!(ep.get(2), Some(20)); assert_eq!(ep.get(3), Some(20));
assert_eq!(ep.get(4), Some(30)); assert_eq!(ep.get(5), None); }
#[test]
fn test_duplicate_end_positions() {
let positions = vec![0, 10, 10, 20, 20, 20];
let ep = EndPositions::build(&positions, 100);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(10));
assert_eq!(ep.get(2), Some(10));
assert_eq!(ep.get(3), Some(20));
assert_eq!(ep.get(4), Some(20));
assert_eq!(ep.get(5), Some(20));
}
#[test]
fn test_non_monotonic_falls_back_to_dense() {
let positions = vec![0, 20, 10, 0, 30];
let ep = EndPositions::build(&positions, 100);
assert!(!ep.is_compact());
assert_eq!(ep.get(0), None);
assert_eq!(ep.get(1), Some(20));
assert_eq!(ep.get(2), Some(10));
assert_eq!(ep.get(3), None);
assert_eq!(ep.get(4), Some(30));
}
#[test]
fn test_all_scalars() {
let positions = vec![5, 10, 15, 20, 25];
let ep = EndPositions::build(&positions, 100);
assert!(ep.is_compact());
for (i, &expected) in positions.iter().enumerate() {
assert_eq!(ep.get(i), Some(expected as usize), "get({}) failed", i);
}
}
#[test]
fn test_single_scalar() {
let positions = vec![0, 42, 0];
let ep = EndPositions::build(&positions, 100);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(42)); assert_eq!(ep.get(2), Some(42));
}
#[test]
fn test_memory_savings() {
let mut positions = Vec::with_capacity(1000);
let mut pos = 5u32;
for i in 0..1000 {
if i % 5 < 2 {
positions.push(0);
} else {
positions.push(pos);
pos += 7 + (i as u32 % 15);
}
}
let text_len = pos as usize + 100;
let ep = EndPositions::build(&positions, text_len);
assert!(ep.is_compact());
let vec_size = positions.len() * 4;
let ep_size = ep.heap_size();
eprintln!(
"Memory: Vec<u32>={} bytes, EndPositions={} bytes ({:.2}x compression)",
vec_size,
ep_size,
vec_size as f64 / ep_size as f64
);
assert!(
ep_size < vec_size,
"EndPositions {} should be smaller than Vec<u32> {}",
ep_size,
vec_size
);
for (i, &expected) in positions.iter().enumerate() {
let got = ep.get(i);
if expected > 0 {
assert_eq!(
got,
Some(expected as usize),
"get({}) failed: expected {}, got {:?}",
i,
expected,
got
);
}
}
}
#[test]
fn test_realistic_yaml_pattern() {
let positions = vec![0, 5, 11, 15, 22];
let ep = EndPositions::build(&positions, 30);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(5));
assert_eq!(ep.get(2), Some(11));
assert_eq!(ep.get(3), Some(15));
assert_eq!(ep.get(4), Some(22));
}
#[test]
fn test_large_positions() {
let positions = vec![0, 100_000, 200_000, 0, 300_000];
let ep = EndPositions::build(&positions, 400_000);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(100_000));
assert_eq!(ep.get(2), Some(200_000));
assert_eq!(ep.get(3), Some(200_000));
assert_eq!(ep.get(4), Some(300_000));
}
#[test]
fn test_many_elements_with_samples() {
let mut positions = Vec::with_capacity(1000);
let mut pos = 1u32;
for i in 0..1000 {
if i % 3 == 0 {
positions.push(0); } else {
positions.push(pos);
pos += 5;
}
}
let text_len = pos as usize + 100;
let ep = EndPositions::build(&positions, text_len);
assert!(ep.is_compact());
for (i, &expected) in positions.iter().enumerate() {
let got = ep.get(i);
if expected > 0 {
assert_eq!(got, Some(expected as usize), "get({}) failed", i);
}
}
}
#[test]
fn test_real_yaml_compression() {
let mut yaml = String::new();
for i in 0..500 {
yaml.push_str(&format!("key_{}: value_{}\n", i, i));
}
yaml.push_str("nested:\n");
for i in 0..200 {
yaml.push_str(&format!(" sub_{}: sub_val_{}\n", i, i));
}
yaml.push_str("list:\n");
for i in 0..300 {
yaml.push_str(&format!(" - item_{}\n", i));
}
let semi = super::super::parser::build_semi_index(yaml.as_bytes()).unwrap();
let text_len = yaml.len();
let dense_size = semi.bp_to_text_end.len() * 4;
let ep = EndPositions::build(&semi.bp_to_text_end, text_len);
let compact_size = ep.heap_size();
let num_opens = semi.bp_to_text_end.len();
let num_nonzero = semi.bp_to_text_end.iter().filter(|&&v| v > 0).count();
eprintln!(
"Real YAML ({} bytes, {} opens, {} scalars): Vec<u32>={} bytes, EndPositions={} bytes ({:.2}x compression)",
text_len, num_opens, num_nonzero, dense_size, compact_size,
dense_size as f64 / compact_size as f64
);
assert!(ep.is_compact(), "Real YAML should use compact encoding");
assert!(compact_size < dense_size, "Compact should be smaller");
for (i, &expected) in semi.bp_to_text_end.iter().enumerate() {
let got = ep.get(i);
if expected > 0 {
assert_eq!(got, Some(expected as usize), "open {} mismatch", i);
}
}
}
#[test]
fn test_end_position_at_text_len() {
let positions = vec![0, 64];
let ep = EndPositions::build(&positions, 64);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(64));
}
#[test]
fn test_end_position_at_text_len_multiple_of_64() {
let positions = vec![0, 50, 100, 0, 192];
let ep = EndPositions::build(&positions, 192);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(50));
assert_eq!(ep.get(2), Some(100));
assert_eq!(ep.get(3), Some(100));
assert_eq!(ep.get(4), Some(192));
}
#[test]
fn test_containers_between_scalars_return_previous() {
let positions = vec![0, 10, 0, 0, 20, 0, 30];
let ep = EndPositions::build(&positions, 100);
assert!(ep.is_compact());
assert_eq!(ep.get(0), None); assert_eq!(ep.get(1), Some(10)); assert_eq!(ep.get(2), Some(10)); assert_eq!(ep.get(3), Some(10)); assert_eq!(ep.get(4), Some(20)); assert_eq!(ep.get(5), Some(20)); assert_eq!(ep.get(6), Some(30)); }
#[test]
fn test_random_access_after_sequential() {
let positions = vec![5, 10, 15, 20, 25];
let ep = EndPositions::build(&positions, 100);
assert_eq!(ep.get(0), Some(5));
assert_eq!(ep.get(1), Some(10));
assert_eq!(ep.get(2), Some(15));
assert_eq!(ep.get(0), Some(5));
assert_eq!(ep.get(1), Some(10));
assert_eq!(ep.get(3), Some(20));
assert_eq!(ep.get(4), Some(25));
}
}