pub(crate) const RANK_STRIDE: usize = 1024;
#[derive(Debug, Clone)]
pub struct SampledSuffixArray {
rate: usize,
bwt_len: usize,
marks: Vec<u64>,
superblocks: Vec<u32>,
values: Vec<u32>,
}
impl SampledSuffixArray {
pub fn from_sorted_samples(
bwt_len: usize,
rate: usize,
samples: impl Iterator<Item = (usize, u32)>,
) -> Self {
let words = bwt_len.div_ceil(64);
let mut marks = vec![0u64; words];
let mut values = Vec::new();
#[cfg(debug_assertions)]
let mut prev: Option<usize> = None;
for (pos, val) in samples {
debug_assert!(pos < bwt_len, "sample position out of range");
#[cfg(debug_assertions)]
{
debug_assert!(
prev.is_none_or(|p| pos > p),
"samples must be yielded in strictly ascending BWT-position order"
);
prev = Some(pos);
}
marks[pos / 64] |= 1u64 << (pos % 64);
values.push(val);
}
let superblocks = build_superblocks(&marks, bwt_len, RANK_STRIDE);
Self {
rate: rate.max(1),
bwt_len,
marks,
superblocks,
values,
}
}
pub fn rate(&self) -> usize {
self.rate
}
pub fn len(&self) -> usize {
self.bwt_len
}
pub fn is_empty(&self) -> bool {
self.bwt_len == 0
}
pub fn num_samples(&self) -> usize {
self.values.len()
}
pub(crate) fn marks(&self) -> &[u64] {
&self.marks
}
pub(crate) fn superblocks(&self) -> &[u32] {
&self.superblocks
}
pub(crate) fn values(&self) -> &[u32] {
&self.values
}
pub fn sample_at(&self, bwt_idx: usize) -> Option<u32> {
debug_assert!(bwt_idx < self.bwt_len, "bwt_idx out of range");
let word = self.marks[bwt_idx / 64];
if word & (1u64 << (bwt_idx % 64)) == 0 {
return None;
}
let sb = bwt_idx / RANK_STRIDE;
let rank = self.superblocks[sb] + popcount_prefix(&self.marks, sb * RANK_STRIDE, bwt_idx);
Some(self.values[rank as usize])
}
}
fn popcount_prefix(words: &[u64], start: usize, end: usize) -> u32 {
if end <= start {
return 0;
}
let start_word = start / 64;
let end_word = (end - 1) / 64;
let start_bit = start % 64;
let end_bit = end % 64;
if start_word == end_word {
let mut w = words[start_word];
w &= !((1u64 << start_bit) - 1);
if end_bit != 0 {
w &= (1u64 << end_bit) - 1;
}
return w.count_ones();
}
let mut count = (words[start_word] & !((1u64 << start_bit) - 1)).count_ones();
for w in &words[start_word + 1..end_word] {
count += w.count_ones();
}
let mut last = words[end_word];
if end_bit != 0 {
last &= (1u64 << end_bit) - 1;
}
count + last.count_ones()
}
fn build_superblocks(marks: &[u64], bwt_len: usize, stride: usize) -> Vec<u32> {
let num = bwt_len.div_ceil(stride);
let mut sb = Vec::with_capacity(num + 1);
sb.push(0u32);
let mut acc = 0u32;
for i in 0..num {
let start = i * stride;
let end = ((i + 1) * stride).min(bwt_len);
acc += popcount_prefix(marks, start, end);
sb.push(acc);
}
sb
}
#[cfg(test)]
mod tests {
use super::*;
fn fixture() -> SampledSuffixArray {
SampledSuffixArray::from_sorted_samples(
10,
3,
[(0usize, 9u32), (3, 6), (6, 3), (9, 0)].into_iter(),
)
}
#[test]
fn sample_at_returns_value_at_sampled_positions() {
let s = fixture();
assert_eq!(s.sample_at(0), Some(9));
assert_eq!(s.sample_at(3), Some(6));
assert_eq!(s.sample_at(6), Some(3));
assert_eq!(s.sample_at(9), Some(0));
}
#[test]
fn sample_at_returns_none_at_unsampled_positions() {
let s = fixture();
for i in [1usize, 2, 4, 5, 7, 8] {
assert_eq!(s.sample_at(i), None, "position {i} should be unsampled");
}
}
#[test]
fn metadata_is_correct() {
let s = fixture();
assert_eq!(s.rate(), 3);
assert_eq!(s.len(), 10);
assert_eq!(s.num_samples(), 4);
assert!(!s.is_empty());
}
#[test]
fn handles_a_superblock_boundary() {
let n = RANK_STRIDE * 2 + 5;
let samples: Vec<(usize, u32)> = (0..n).step_by(64).map(|p| (p, p as u32)).collect();
let s = SampledSuffixArray::from_sorted_samples(n, 64, samples.iter().copied());
for &(p, v) in &samples {
assert_eq!(s.sample_at(p), Some(v));
}
assert_eq!(s.sample_at(1), None);
assert_eq!(s.sample_at(RANK_STRIDE + 1), None);
}
#[test]
fn empty_has_no_samples() {
let s = SampledSuffixArray::from_sorted_samples(0, 1, std::iter::empty());
assert!(s.is_empty());
assert_eq!(s.num_samples(), 0);
}
#[test]
fn rank_is_correct_for_unaligned_positions_in_a_higher_superblock() {
let n = RANK_STRIDE * 2;
let samples = [(3usize, 100u32), (67, 101), (1027, 102), (2000, 103)];
let s = SampledSuffixArray::from_sorted_samples(n, 64, samples.into_iter());
assert_eq!(s.sample_at(3), Some(100));
assert_eq!(s.sample_at(67), Some(101));
assert_eq!(s.sample_at(1027), Some(102)); assert_eq!(s.sample_at(2000), Some(103));
assert_eq!(s.sample_at(1026), None);
assert_eq!(s.sample_at(1028), None);
}
}