use epserde::prelude::*;
#[derive(Clone, Copy, Debug)]
pub struct DecodedOffset {
pub absolute_offset: u64,
pub relative_offset: u64,
}
impl DecodedOffset {
pub fn new(absolute_offset: u64, relative_offset: u64) -> Self {
Self {
absolute_offset,
relative_offset,
}
}
}
#[derive(Clone, Debug, Epserde)]
pub struct OffsetsVector {
offsets: Vec<u64>,
}
impl OffsetsVector {
pub fn new() -> Self {
Self {
offsets: vec![0], }
}
#[inline]
pub fn push(&mut self, offset: u64) {
self.offsets.push(offset);
}
#[inline]
pub fn access(&self, i: usize) -> u64 {
assert!(i < self.offsets.len(), "Offset index {} out of bounds", i);
self.offsets[i]
}
#[inline]
pub fn decode(&self, absolute_offset: u64) -> DecodedOffset {
DecodedOffset::new(absolute_offset, absolute_offset)
}
#[inline]
pub fn len(&self) -> usize {
self.offsets.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.offsets.is_empty()
}
#[inline]
pub fn num_bytes(&self) -> u64 {
(self.offsets.len() * 8) as u64
}
#[inline]
pub fn num_bits(&self) -> u64 {
(self.offsets.len() * 64) as u64
}
#[inline]
pub fn locate(&self, pos: u64) -> Option<(u64, u64)> {
let n = self.offsets.len();
if n < 2 {
return None;
}
let idx = self.offsets.partition_point(|&x| x <= pos);
if idx == 0 {
return None;
}
let string_id = idx - 1;
if string_id + 1 < n {
Some((string_id as u64, self.offsets[string_id]))
} else {
None
}
}
#[inline]
pub fn locate_branchless(&self, pos: u64) -> Option<(u64, u64)> {
let n = self.offsets.len();
if n < 2 {
return None;
}
let data = self.offsets.as_slice();
let mut lo: usize = 0;
let mut size = n;
while size > 1 {
let half = size / 2;
let mid = lo + half;
lo = if unsafe { *data.get_unchecked(mid) } <= pos { mid } else { lo };
size -= half;
}
if unsafe { *data.get_unchecked(lo) } > pos {
return None;
}
if lo + 1 < n {
Some((lo as u64, unsafe { *data.get_unchecked(lo) }))
} else {
None
}
}
}
impl Default for OffsetsVector {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_offsets_vector_creation() {
let offsets = OffsetsVector::new();
assert_eq!(offsets.len(), 1);
assert_eq!(offsets.access(0), 0);
}
#[test]
fn test_offsets_vector_push() {
let mut offsets = OffsetsVector::new();
offsets.push(100);
offsets.push(200);
offsets.push(300);
assert_eq!(offsets.len(), 4);
assert_eq!(offsets.access(0), 0);
assert_eq!(offsets.access(1), 100);
assert_eq!(offsets.access(2), 200);
assert_eq!(offsets.access(3), 300);
}
#[test]
fn test_offsets_vector_decode() {
let offsets = OffsetsVector::new();
let decoded = offsets.decode(50);
assert_eq!(decoded.absolute_offset, 50);
assert_eq!(decoded.relative_offset, 50);
}
#[test]
fn test_decoded_offset_creation() {
let decoded = DecodedOffset::new(1000, 500);
assert_eq!(decoded.absolute_offset, 1000);
assert_eq!(decoded.relative_offset, 500);
}
}
use sux::dict::elias_fano::{EliasFanoBuilder, EfSeqDict};
use sux::traits::{IndexedSeq, Succ};
use sux::traits::iter::BidiIterator;
use mem_dbg::{MemSize, SizeFlags};
pub struct EliasFanoOffsets {
ef: EfSeqDict,
}
impl EliasFanoOffsets {
pub fn from_vec(offsets: &[u64]) -> Self {
let n = offsets.len();
let u = if n > 0 { offsets[n - 1] as usize + 1 } else { 1 };
let mut builder = EliasFanoBuilder::new(n, u);
for &v in offsets {
builder.push(v as usize);
}
Self { ef: builder.build_with_seq_and_dict() }
}
pub fn from_offsets_vector(ov: OffsetsVector) -> Self {
Self::from_vec(&ov.offsets)
}
#[inline]
pub fn access(&self, i: usize) -> u64 {
unsafe { self.ef.get_unchecked(i) as u64 }
}
#[inline]
pub fn locate_with_end(&self, pos: u64) -> Option<(u64, u64, u64)> {
let n = self.ef.len();
if n < 2 {
return None;
}
let (idx, mut iter) = self.ef.iter_bidi_from_succ(pos as usize)?;
let val = iter.next()?;
if val == pos as usize {
if idx + 1 < n {
let end = iter.next()? as u64;
Some((idx as u64, pos, end))
} else {
None
}
} else {
debug_assert!(idx > 0);
iter.prev(); let begin = iter.prev()? as u64; Some(((idx - 1) as u64, begin, val as u64))
}
}
#[inline]
pub fn locate(&self, pos: u64) -> Option<(u64, u64)> {
let n = self.ef.len();
if n < 2 {
return None;
}
let (idx, mut iter) = self.ef.iter_bidi_from_succ(pos as usize)?;
let val = iter.next()?;
if val == pos as usize {
if idx + 1 < n {
Some((idx as u64, pos))
} else {
None
}
} else {
debug_assert!(idx > 0);
iter.prev(); let string_begin = iter.prev()? as u64; Some(((idx - 1) as u64, string_begin))
}
}
#[inline]
pub fn len(&self) -> usize {
self.ef.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.ef.len() == 0
}
#[inline]
pub fn num_bytes(&self) -> u64 {
self.ef.mem_size(SizeFlags::default()) as u64
}
#[inline]
pub fn num_bits(&self) -> u64 {
self.num_bytes() * 8
}
pub fn write_to<W: std::io::Write>(&self, writer: &mut W) -> std::io::Result<()> {
unsafe {
self.ef.serialize(writer)
.map_err(std::io::Error::other)?;
}
Ok(())
}
pub fn read_from<R: std::io::Read>(reader: &mut R) -> std::io::Result<Self> {
let ef = unsafe {
EfSeqDict::deserialize_full(reader)
.map_err(std::io::Error::other)?
};
Ok(Self { ef })
}
}
impl std::fmt::Debug for EliasFanoOffsets {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("EliasFanoOffsets")
.field("len", &self.ef.len())
.finish()
}
}
#[cfg(test)]
mod ef_tests {
use super::*;
#[test]
fn test_ef_from_vec() {
let offsets = vec![0, 100, 200, 300, 400];
let ef = EliasFanoOffsets::from_vec(&offsets);
assert_eq!(ef.len(), 5);
assert_eq!(ef.access(0), 0);
assert_eq!(ef.access(1), 100);
assert_eq!(ef.access(2), 200);
assert_eq!(ef.access(3), 300);
assert_eq!(ef.access(4), 400);
}
#[test]
fn test_ef_locate() {
let offsets = vec![0, 100, 200, 300, 400];
let ef = EliasFanoOffsets::from_vec(&offsets);
assert_eq!(ef.locate(50), Some((0, 0)));
assert_eq!(ef.locate(100), Some((1, 100)));
assert_eq!(ef.locate(199), Some((1, 100)));
assert_eq!(ef.locate(300), Some((3, 300)));
assert_eq!(ef.locate(399), Some((3, 300)));
assert_eq!(ef.locate(400), None);
}
#[test]
fn test_ef_locate_with_end() {
let offsets = vec![0, 100, 200, 300, 400];
let ef = EliasFanoOffsets::from_vec(&offsets);
assert_eq!(ef.locate_with_end(50), Some((0, 0, 100)));
assert_eq!(ef.locate_with_end(100), Some((1, 100, 200)));
assert_eq!(ef.locate_with_end(199), Some((1, 100, 200)));
assert_eq!(ef.locate_with_end(300), Some((3, 300, 400)));
assert_eq!(ef.locate_with_end(399), Some((3, 300, 400)));
assert_eq!(ef.locate_with_end(400), None);
}
#[test]
fn test_ef_serialization_roundtrip() {
let offsets = vec![0, 100, 200, 300, 400];
let ef = EliasFanoOffsets::from_vec(&offsets);
let mut buf = Vec::new();
ef.write_to(&mut buf).unwrap();
let ef2 = EliasFanoOffsets::read_from(&mut &buf[..]).unwrap();
assert_eq!(ef2.len(), 5);
for i in 0..5 {
assert_eq!(ef.access(i), ef2.access(i));
}
assert_eq!(ef2.locate(150), Some((1, 100)));
assert_eq!(ef2.locate_with_end(150), Some((1, 100, 200)));
}
#[test]
fn test_ef_locate_with_end_stress() {
let mut offsets = vec![0u64];
let gaps = [3, 7, 1, 15, 2, 100, 5, 31, 8, 63, 4, 127, 6, 255, 10, 50,
1, 1, 1, 33, 200, 9, 17, 3, 11, 500, 2, 7, 13, 41];
for &g in gaps.iter() {
offsets.push(offsets.last().unwrap() + g);
}
let ef = EliasFanoOffsets::from_vec(&offsets);
for (i, &v) in offsets.iter().enumerate() {
assert_eq!(ef.access(i), v, "access({i}) mismatch");
}
let universe = *offsets.last().unwrap();
for pos in 0..=universe {
let expected = {
let mut found = None;
for i in 0..offsets.len() - 1 {
if offsets[i] <= pos && pos < offsets[i + 1] {
found = Some((i as u64, offsets[i], offsets[i + 1]));
break;
}
}
found
};
let got = ef.locate_with_end(pos);
assert_eq!(got, expected, "locate_with_end({pos}) mismatch");
}
assert_eq!(ef.locate_with_end(universe), None);
assert_eq!(ef.locate_with_end(universe + 1), None);
}
#[test]
fn test_ef_locate_large_universe() {
let offsets: Vec<u64> = vec![0, 1000, 5000, 5001, 5002, 10000, 100000, 100001, 500000];
let ef = EliasFanoOffsets::from_vec(&offsets);
let test_positions: Vec<u64> = {
let mut v = Vec::new();
for &off in &offsets {
if off > 0 { v.push(off - 1); }
v.push(off);
v.push(off + 1);
}
v.extend_from_slice(&[500, 3000, 5000, 7500, 50000, 200000, 400000]);
v.sort();
v.dedup();
v
};
for pos in test_positions {
let expected = {
let mut found = None;
for i in 0..offsets.len() - 1 {
if offsets[i] <= pos && pos < offsets[i + 1] {
found = Some((i as u64, offsets[i], offsets[i + 1]));
break;
}
}
found
};
let got = ef.locate_with_end(pos);
assert_eq!(got, expected, "locate_with_end({pos}) mismatch");
}
}
}