use crate::core::{ContigSet, Locus};
use crate::genomics::fm_backing::{self, BwtBacking};
use crate::genomics::index::format::{SectionEntry, SectionKind};
use crate::genomics::index::io::IndexIoError;
use crate::genomics::{popcount_range, BaseCode, FMInterval, FmSymbol, RANK_STRIDE};
#[derive(Debug)]
pub struct FmIndexView<'a> {
bwt_len: usize,
block_size: usize,
num_blocks: usize,
sentinel_pos: usize,
sample_rate: usize,
c_table: [u32; 6],
boundaries: &'a [u32],
block_dir: &'a [u64],
blocks: &'a [u8],
sampled: SampledView<'a>,
}
#[derive(Debug)]
struct SampledView<'a> {
bwt_len: usize,
marks: &'a [u64],
superblocks: &'a [u32],
values: &'a [u32],
}
struct BlockView<'a> {
start: usize,
end: usize,
sentinel_offset: Option<usize>,
stride: usize,
bwt_data: &'a [u64],
bwt_amb: &'a [u64],
occ_bitvecs: [&'a [u64]; 5],
occ_superblocks: [&'a [u32]; 5],
}
impl<'a> FmIndexView<'a> {
pub(crate) fn new(bytes: &'a [u8], sections: &[SectionEntry]) -> Result<Self, IndexIoError> {
if cfg!(target_endian = "big") {
return Err(IndexIoError::Invalid(
"zero-copy index view requires a little-endian host".to_string(),
));
}
let fm_meta = section_bytes(bytes, sections, SectionKind::FmMeta)?;
let mut o = 0usize;
let block_size = read_u64(fm_meta, &mut o)? as usize;
let bwt_len = read_u64(fm_meta, &mut o)? as usize;
let sentinel_pos = read_u64(fm_meta, &mut o)? as usize;
let sample_rate = read_u64(fm_meta, &mut o)? as usize;
let num_blocks = read_u64(fm_meta, &mut o)? as usize;
let mut c_table = [0u32; 6];
for slot in &mut c_table {
*slot = read_u32(fm_meta, &mut o)?;
}
let boundaries_bytes = section_bytes(bytes, sections, SectionKind::Boundaries)?;
let boundaries = as_u32_slice(boundaries_bytes)?;
if boundaries.len() != 6 * (num_blocks + 1) {
return Err(IndexIoError::Invalid(
"boundaries length mismatch".to_string(),
));
}
let blocks = section_bytes(bytes, sections, SectionKind::Blocks)?;
let dir_bytes = num_blocks
.checked_mul(8)
.ok_or_else(|| IndexIoError::Invalid("block directory overflow".to_string()))?;
if dir_bytes > blocks.len() {
return Err(IndexIoError::Invalid(
"block directory out of bounds".to_string(),
));
}
let block_dir = as_u64_slice(&blocks[..dir_bytes])?;
validate_block_records(blocks, block_dir)?;
let sa = section_bytes(bytes, sections, SectionKind::SaSamples)?;
let mut so = 0usize;
let rate = read_u64(sa, &mut so)? as usize;
if rate != sample_rate {
return Err(IndexIoError::Invalid(format!(
"SaSamples rate {rate} != FmMeta sa_sample_rate {sample_rate}"
)));
}
let s_bwt_len = read_u64(sa, &mut so)? as usize;
let marks_words = read_u64(sa, &mut so)? as usize;
let sb_len = read_u64(sa, &mut so)? as usize;
let values_len = read_u64(sa, &mut so)? as usize;
let marks = as_u64_slice(slice_exact(sa, &mut so, marks_words * 8)?)?;
let superblocks = as_u32_slice(slice_exact(sa, &mut so, sb_len * 4)?)?;
let values = as_u32_slice(slice_exact(sa, &mut so, values_len * 4)?)?;
Ok(Self {
bwt_len,
block_size,
num_blocks,
sentinel_pos,
sample_rate,
c_table,
boundaries,
block_dir,
blocks,
sampled: SampledView {
bwt_len: s_bwt_len,
marks,
superblocks,
values,
},
})
}
pub fn backward_search(&self, pattern: &[u8]) -> FMInterval {
fm_backing::backward_search(self, pattern)
}
pub fn sa_at(&self, index: usize) -> usize {
fm_backing::sa_at(self, index)
}
pub fn locate_interval(&self, interval: FMInterval, max_hits: usize) -> Vec<u32> {
fm_backing::locate_interval(self, interval, max_hits)
}
fn block(&self, block_idx: usize) -> BlockView<'a> {
let rec = self.block_dir[block_idx] as usize;
let s = &self.blocks[rec..];
let mut o = 0usize;
let start = read_u64_infallible(s, &mut o) as usize;
let end = read_u64_infallible(s, &mut o) as usize;
let sentinel_raw = read_i64_infallible(s, &mut o);
let stride = read_u64_infallible(s, &mut o) as usize;
let bwt_data_words = read_u64_infallible(s, &mut o) as usize;
let bwt_amb_words = read_u64_infallible(s, &mut o) as usize;
let occ_bitvec_words = read_u64_infallible(s, &mut o) as usize;
let occ_superblock_len = read_u64_infallible(s, &mut o) as usize;
let bwt_data = as_u64_slice(&s[o..o + bwt_data_words * 8]).expect("aligned");
o += bwt_data_words * 8;
let bwt_amb = as_u64_slice(&s[o..o + bwt_amb_words * 8]).expect("aligned");
o += bwt_amb_words * 8;
let mut occ_bitvecs: [&[u64]; 5] = [&[]; 5];
for slot in &mut occ_bitvecs {
*slot = as_u64_slice(&s[o..o + occ_bitvec_words * 8]).expect("aligned");
o += occ_bitvec_words * 8;
}
let mut occ_superblocks: [&[u32]; 5] = [&[]; 5];
for slot in &mut occ_superblocks {
*slot = as_u32_slice(&s[o..o + occ_superblock_len * 4]).expect("aligned");
o += occ_superblock_len * 4;
}
BlockView {
start,
end,
sentinel_offset: if sentinel_raw < 0 {
None
} else {
Some(sentinel_raw as usize)
},
stride,
bwt_data,
bwt_amb,
occ_bitvecs,
occ_superblocks,
}
}
}
impl BwtBacking for FmIndexView<'_> {
fn bwt_len(&self) -> usize {
self.bwt_len
}
fn block_size(&self) -> usize {
self.block_size
}
fn num_blocks(&self) -> usize {
self.num_blocks
}
fn sentinel_pos(&self) -> usize {
self.sentinel_pos
}
fn sample_rate(&self) -> usize {
self.sample_rate
}
fn c_table(&self) -> [u32; 6] {
self.c_table
}
fn boundary_base(&self, block_idx: usize, base_index: usize) -> u32 {
self.boundaries[block_idx * 6 + base_index]
}
fn boundary_sentinel(&self, block_idx: usize) -> u32 {
self.boundaries[block_idx * 6 + 5]
}
fn block_rank(&self, block_idx: usize, symbol: FmSymbol, within: usize) -> u32 {
let block = self.block(block_idx);
let n = block.end - block.start;
let bounded = within.min(n);
match symbol {
FmSymbol::Sentinel => match block.sentinel_offset {
Some(off) if off < bounded => 1,
_ => 0,
},
FmSymbol::Base(code) => {
let bi = code.index();
let sb = bounded / block.stride;
let within_start = sb * block.stride;
let mut count = block.occ_superblocks[bi][sb]
+ popcount_range(block.occ_bitvecs[bi], within_start, bounded);
if code == BaseCode::N {
if let Some(off) = block.sentinel_offset {
if off < bounded {
count = count.saturating_sub(1);
}
}
}
count
}
}
}
fn block_symbol(&self, block_idx: usize, within: usize) -> FmSymbol {
let block = self.block(block_idx);
let amb_word = block.bwt_amb[within / 64];
if amb_word & (1u64 << (within % 64)) != 0 {
return FmSymbol::Base(BaseCode::N);
}
let data_word = block.bwt_data[within / 32];
let code = ((data_word >> ((within % 32) * 2)) & 0b11) as u8;
let base = match code {
0 => BaseCode::A,
1 => BaseCode::C,
2 => BaseCode::G,
_ => BaseCode::T,
};
FmSymbol::Base(base)
}
fn sampled_at(&self, index: usize) -> Option<u32> {
debug_assert!(index < self.sampled.bwt_len);
let word = self.sampled.marks[index / 64];
if word & (1u64 << (index % 64)) == 0 {
return None;
}
let sb = index / RANK_STRIDE;
let rank = self.sampled.superblocks[sb]
+ popcount_range(self.sampled.marks, sb * RANK_STRIDE, index);
Some(self.sampled.values[rank as usize])
}
}
#[derive(Debug)]
pub struct GenomeIndexView<'a> {
fm: FmIndexView<'a>,
contigs: &'a ContigSet,
}
impl<'a> GenomeIndexView<'a> {
pub(crate) fn new(fm: FmIndexView<'a>, contigs: &'a ContigSet) -> Self {
Self { fm, contigs }
}
pub fn fm(&self) -> &FmIndexView<'a> {
&self.fm
}
pub fn contigs(&self) -> &ContigSet {
self.contigs
}
pub fn locate_exact(&self, pattern: &[u8], max_hits: usize) -> Vec<Locus> {
if pattern.is_empty() {
return Vec::new();
}
let pattern: Vec<u8> = pattern.iter().map(|b| b.to_ascii_uppercase()).collect();
let interval = self.fm.backward_search(&pattern);
let len = pattern.len() as u32;
let mut loci: Vec<Locus> = self
.fm
.locate_interval(interval, max_hits)
.into_iter()
.filter_map(|g| self.contigs.resolve_span(g as u64, len))
.collect();
loci.sort_unstable();
loci
}
}
#[derive(Debug)]
pub struct ReferenceView<'a> {
len: usize,
data: &'a [u64],
amb: &'a [u64],
}
impl<'a> ReferenceView<'a> {
pub(crate) fn new(bytes: &'a [u8], sections: &[SectionEntry]) -> Result<Self, IndexIoError> {
if cfg!(target_endian = "big") {
return Err(IndexIoError::Invalid(
"zero-copy reference view requires a little-endian host".to_string(),
));
}
let section = section_bytes(bytes, sections, SectionKind::Reference2bit)?;
let mut o = 0usize;
let len = read_u64(section, &mut o)? as usize;
let data_words = read_u64(section, &mut o)? as usize;
let amb_words = read_u64(section, &mut o)? as usize;
let data = as_u64_slice(slice_exact(section, &mut o, data_words.saturating_mul(8))?)?;
let amb = as_u64_slice(slice_exact(section, &mut o, amb_words.saturating_mul(8))?)?;
if data.len().saturating_mul(32) < len || amb.len().saturating_mul(64) < len {
return Err(IndexIoError::Invalid(
"Reference2bit section too small for the declared length".to_string(),
));
}
Ok(Self { len, data, amb })
}
pub fn len(&self) -> usize {
self.len
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn base_at(&self, global: usize) -> u8 {
debug_assert!(global < self.len, "reference index out of range");
if self.amb[global / 64] & (1u64 << (global % 64)) != 0 {
return b'N';
}
let code = ((self.data[global / 32] >> ((global % 32) * 2)) & 0b11) as u8;
match code {
0 => b'A',
1 => b'C',
2 => b'G',
_ => b'T',
}
}
pub fn decode_window(&self, start: usize, end: usize, out: &mut Vec<u8>) {
out.clear();
let end = end.min(self.len);
for i in start..end {
out.push(self.base_at(i));
}
}
pub fn decode_window_arc(&self, start: usize, end: usize) -> std::sync::Arc<[u8]> {
let end = end.min(self.len);
(start..end).map(|i| self.base_at(i)).collect()
}
}
fn validate_block_records(blocks: &[u8], block_dir: &[u64]) -> Result<(), IndexIoError> {
const HEADER: usize = 64; for &rec_off in block_dir {
let rec = rec_off as usize;
#[allow(clippy::manual_is_multiple_of)]
if rec % 8 != 0 {
return Err(IndexIoError::Invalid(
"block record offset is not 8-aligned".to_string(),
));
}
let header_end = rec
.checked_add(HEADER)
.filter(|&e| e <= blocks.len())
.ok_or_else(|| {
IndexIoError::Invalid("block record header out of bounds".to_string())
})?;
let mut o = rec + 32;
let bwt_data_words = read_u64(blocks, &mut o)? as usize;
let bwt_amb_words = read_u64(blocks, &mut o)? as usize;
let occ_bitvec_words = read_u64(blocks, &mut o)? as usize;
let occ_superblock_len = read_u64(blocks, &mut o)? as usize;
debug_assert_eq!(o, header_end);
let payload = bwt_data_words
.checked_mul(8)
.and_then(|x| bwt_amb_words.checked_mul(8).and_then(|y| x.checked_add(y)))
.and_then(|x| {
occ_bitvec_words
.checked_mul(8)
.and_then(|y| y.checked_mul(5))
.and_then(|y| x.checked_add(y))
})
.and_then(|x| {
occ_superblock_len
.checked_mul(4)
.and_then(|y| y.checked_mul(5))
.and_then(|y| x.checked_add(y))
})
.ok_or_else(|| IndexIoError::Invalid("block record size overflow".to_string()))?;
let rec_end = header_end
.checked_add(payload)
.filter(|&e| e <= blocks.len())
.ok_or_else(|| {
IndexIoError::Invalid("block record payload out of bounds".to_string())
})?;
let _ = rec_end;
}
Ok(())
}
fn section_bytes<'a>(
bytes: &'a [u8],
sections: &[SectionEntry],
kind: SectionKind,
) -> Result<&'a [u8], IndexIoError> {
let s = sections
.iter()
.find(|s| s.kind == kind)
.ok_or_else(|| IndexIoError::Invalid(format!("missing section {kind:?}")))?;
let start = s.offset as usize;
let end = start + s.bytes as usize;
bytes
.get(start..end)
.ok_or_else(|| IndexIoError::Invalid(format!("section {kind:?} out of bounds")))
}
fn as_u64_slice(bytes: &[u8]) -> Result<&[u64], IndexIoError> {
let (prefix, mid, suffix) = unsafe { bytes.align_to::<u64>() };
if !prefix.is_empty() || !suffix.is_empty() {
return Err(IndexIoError::Invalid("misaligned u64 array".to_string()));
}
Ok(mid)
}
fn as_u32_slice(bytes: &[u8]) -> Result<&[u32], IndexIoError> {
let (prefix, mid, suffix) = unsafe { bytes.align_to::<u32>() };
if !prefix.is_empty() || !suffix.is_empty() {
return Err(IndexIoError::Invalid("misaligned u32 array".to_string()));
}
Ok(mid)
}
fn slice_exact<'a>(
bytes: &'a [u8],
offset: &mut usize,
len: usize,
) -> Result<&'a [u8], IndexIoError> {
let end = offset
.checked_add(len)
.ok_or_else(|| IndexIoError::Invalid("slice overflow".to_string()))?;
let out = bytes
.get(*offset..end)
.ok_or_else(|| IndexIoError::Invalid("slice out of bounds".to_string()))?;
*offset = end;
Ok(out)
}
fn read_u32(bytes: &[u8], offset: &mut usize) -> Result<u32, IndexIoError> {
let end = *offset + 4;
let buf = bytes
.get(*offset..end)
.ok_or_else(|| IndexIoError::Invalid("unexpected EOF".to_string()))?;
*offset = end;
Ok(u32::from_le_bytes(buf.try_into().unwrap()))
}
fn read_u64(bytes: &[u8], offset: &mut usize) -> Result<u64, IndexIoError> {
let end = *offset + 8;
let buf = bytes
.get(*offset..end)
.ok_or_else(|| IndexIoError::Invalid("unexpected EOF".to_string()))?;
*offset = end;
Ok(u64::from_le_bytes(buf.try_into().unwrap()))
}
fn read_u64_infallible(bytes: &[u8], offset: &mut usize) -> u64 {
let v = u64::from_le_bytes(bytes[*offset..*offset + 8].try_into().unwrap());
*offset += 8;
v
}
fn read_i64_infallible(bytes: &[u8], offset: &mut usize) -> i64 {
let v = i64::from_le_bytes(bytes[*offset..*offset + 8].try_into().unwrap());
*offset += 8;
v
}