use super::bgzf::{BgzfError, VirtualOffset};
use super::csi_index::CsiIndex;
use seqair_types::Pos0;
use std::path::Path;
use tracing::instrument;
/// Bin id of the BAI/tabix pseudo-bin that carries per-reference metadata
/// rather than real alignment chunks; it is skipped during region queries.
const PSEUDO_BIN: u32 = 37450;
/// Errors produced while reading and parsing BAI or tabix index data.
#[non_exhaustive]
#[derive(Debug, thiserror::Error)]
pub enum BaiError {
    /// The index file could not be read from disk.
    #[error("I/O error reading index {path}")]
    Read { path: std::path::PathBuf, source: std::io::Error },
    /// The file does not start with the `BAI\x01` magic bytes.
    #[error("invalid BAI magic bytes (expected BAI\\x01)")]
    InvalidMagic,
    /// A fixed-width field ran past the end of the data.
    #[error("BAI/tabix data is truncated")]
    Truncated,
    /// BGZF decompression of a tabix file failed.
    #[error("BGZF error decompressing tabix file")]
    Bgzf {
        #[from]
        source: BgzfError,
    },
    /// The tabix file does not start with the `TBI\x01` magic bytes.
    #[error("invalid tabix magic bytes: expected TBI\\x01, got {found:?}")]
    InvalidTabixMagic { found: [u8; 4] },
    /// The tabix header ended before all of its fields could be read.
    #[error("tabix header is truncated: {reason}")]
    TruncatedTabixHeader { reason: &'static str },
    /// A signed count field decoded to a negative value.
    #[error("BAI/tabix field has negative count: {value}")]
    NegativeCount { value: i32 },
    /// A count field exceeded its sanity limit, suggesting corruption.
    #[error("BAI/tabix field `{field}` count {value} exceeds limit {limit}")]
    FieldTooLarge { field: &'static str, value: usize, limit: usize },
}
/// Parsed BAI (BAM index), one entry per reference sequence.
#[derive(Debug)]
pub struct BamIndex {
    // Indexed by reference id (`tid`).
    references: Vec<RefIndex>,
}
/// Index data for one reference sequence: the binning index plus the linear
/// index (one virtual offset per 16384-position window; see `linear_index_min`).
#[derive(Debug)]
struct RefIndex {
    bins: Vec<Bin>,
    linear_index: Vec<VirtualOffset>,
}
/// One bin of the binning index: its id and the file chunks it covers.
#[derive(Debug)]
struct Bin {
    bin_id: u32,
    chunks: Vec<Chunk>,
}
/// A contiguous region of the compressed file delimited by BGZF virtual offsets.
#[derive(Debug, Clone, Copy)]
pub struct Chunk {
    pub begin: VirtualOffset,
    pub end: VirtualOffset,
}
/// A chunk paired with the id of the index bin it came from.
#[derive(Debug, Clone, Copy)]
pub struct AnnotatedChunk {
    pub chunk: Chunk,
    pub bin_id: u32,
}
/// Region-query result split by bin depth: `nearby` holds chunks from
/// fine-grained (deep) bins, `distant` holds chunks from wide, shallow bins
/// (level <= `CACHE_LEVEL_THRESHOLD`).
#[derive(Debug)]
pub struct QueryChunks {
    pub nearby: Vec<Chunk>,
    pub distant: Vec<Chunk>,
}
/// Bins at this level or shallower cover wide genomic spans; `query_split`
/// routes their chunks into `QueryChunks::distant`.
const CACHE_LEVEL_THRESHOLD: u8 = 2;
impl BamIndex {
    /// Reads and parses a `.bai` index file from disk.
    ///
    /// # Errors
    /// Returns [`BaiError::Read`] on I/O failure, [`BaiError::InvalidMagic`]
    /// when the `BAI\x01` magic is missing, or a parse error from the body.
    #[instrument(level = "debug", fields(path = %path.display()))]
    pub fn from_path(path: &Path) -> Result<Self, BaiError> {
        let data = std::fs::read(path)
            .map_err(|source| BaiError::Read { path: path.to_path_buf(), source })?;
        Self::parse_bai(&data)
    }

    /// Parses a `.bai` index from an in-memory buffer (fuzzing entry point).
    #[cfg(feature = "fuzz")]
    pub fn from_bytes(data: &[u8]) -> Result<Self, BaiError> {
        Self::parse_bai(data)
    }

    /// Parses an already-decompressed tabix index from an in-memory buffer
    /// (fuzzing entry point).
    #[cfg(feature = "fuzz")]
    pub fn from_tabix_bytes(data: &[u8]) -> Result<Self, BaiError> {
        Self::parse_tabix(data)
    }

    /// Returns an index with no references (fuzzing helper).
    #[cfg(feature = "fuzz")]
    pub fn empty() -> Self {
        BamIndex { references: Vec::new() }
    }

    /// Reads a BGZF-compressed tabix (`.tbi`) index file and parses it.
    ///
    /// # Errors
    /// Returns [`BaiError::Read`] on I/O failure, [`BaiError::Bgzf`] when
    /// decompression fails, or a tabix header/body parse error.
    #[instrument(level = "debug", fields(path = %path.display()))]
    pub fn from_tabix_path(path: &Path) -> Result<Self, BaiError> {
        let compressed = std::fs::read(path)
            .map_err(|source| BaiError::Read { path: path.to_path_buf(), source })?;
        let data = decompress_bgzf_file(&compressed)?;
        Self::parse_tabix(&data)
    }

    /// Validates the `BAI\x01` magic and parses the reference list after it.
    /// Shared by `from_path` and the fuzzing entry point.
    fn parse_bai(data: &[u8]) -> Result<Self, BaiError> {
        if data.len() < 8 {
            return Err(BaiError::InvalidMagic);
        }
        if !data.starts_with(b"BAI\x01") {
            return Err(BaiError::InvalidMagic);
        }
        let mut pos = 4;
        parse_refs(data, &mut pos)
    }

    /// Validates the `TBI\x01` magic, skips the tabix header (column
    /// configuration and reference names), then parses the per-reference bins.
    /// Shared by `from_tabix_path` and the fuzzing entry point.
    fn parse_tabix(data: &[u8]) -> Result<Self, BaiError> {
        if data.len() < 4 {
            return Err(BaiError::TruncatedTabixHeader { reason: "file too short for magic" });
        }
        if !data.starts_with(b"TBI\x01") {
            let mut found = [0u8; 4];
            if let Some(src) = data.get(..4) {
                found.copy_from_slice(src);
            }
            return Err(BaiError::InvalidTabixMagic { found });
        }
        let mut pos = 4;
        let n_ref = read_i32(data, &mut pos)?;
        // The column-layout fields are not needed for chunk queries.
        let _format = read_i32(data, &mut pos)?;
        let _col_seq = read_i32(data, &mut pos)?;
        let _col_beg = read_i32(data, &mut pos)?;
        let _col_end = read_i32(data, &mut pos)?;
        let _meta = read_i32(data, &mut pos)?;
        let _skip = read_i32(data, &mut pos)?;
        let l_nm = read_i32(data, &mut pos)?;
        // Reject a negative name-block length explicitly: the previous
        // `as usize` cast plus `wrapping_add` bounds check could wrap around
        // and silently accept malformed input.
        let l_nm = usize::try_from(l_nm).map_err(|_| BaiError::NegativeCount { value: l_nm })?;
        let names_end = pos
            .checked_add(l_nm)
            .filter(|&end| end <= data.len())
            .ok_or(BaiError::TruncatedTabixHeader { reason: "names extend past end of data" })?;
        pos = names_end;
        // Surface a negative reference count with the dedicated error instead
        // of casting it to a huge usize.
        let n_ref =
            usize::try_from(n_ref).map_err(|_| BaiError::NegativeCount { value: n_ref })?;
        parse_refs_with_count(data, &mut pos, n_ref)
    }

    /// Returns the sorted, merged chunks that may contain records overlapping
    /// `[start, end]` on reference `tid`. An unknown `tid` yields an empty list.
    pub fn query(&self, tid: u32, start: Pos0, end: Pos0) -> Vec<Chunk> {
        let result = self.query_split(tid, start, end);
        let mut all = result.nearby;
        all.extend(result.distant);
        all.sort_by_key(|c| c.begin);
        merge_overlapping_chunks(&mut all);
        all
    }

    /// Like [`Self::query`], but keeps every chunk unmerged and tagged with the
    /// id of the bin it came from; results are sorted by chunk start.
    pub fn query_annotated(&self, tid: u32, start: Pos0, end: Pos0) -> Vec<AnnotatedChunk> {
        let Some(ref_idx) = self.references.get(tid as usize) else {
            return Vec::new();
        };
        let start_u64 = start.as_u64();
        let end_u64 = end.as_u64();
        // reg2bins takes a half-open interval, so extend the end by one.
        let candidate_bins = reg2bins(start_u64, end_u64.wrapping_add(1));
        let linear_min = linear_index_min(ref_idx, start_u64);
        let mut result = Vec::new();
        for bin in &ref_idx.bins {
            // The pseudo-bin holds metadata, not alignment chunks.
            if bin.bin_id == PSEUDO_BIN {
                continue;
            }
            if candidate_bins.contains(&bin.bin_id) {
                for chunk in &bin.chunks {
                    // The linear index lower-bounds where the region's records
                    // can start; chunks ending at or before it cannot overlap.
                    if chunk.end > linear_min {
                        result.push(AnnotatedChunk { chunk: *chunk, bin_id: bin.bin_id });
                    }
                }
            }
        }
        result.sort_by_key(|c| c.chunk.begin);
        result
    }

    /// Like [`Self::query`], but separates chunks from wide, shallow bins
    /// (level <= `CACHE_LEVEL_THRESHOLD`) into `distant` and the rest into
    /// `nearby`; each list is independently sorted and merged.
    pub fn query_split(&self, tid: u32, start: Pos0, end: Pos0) -> QueryChunks {
        let Some(ref_idx) = self.references.get(tid as usize) else {
            return QueryChunks { nearby: Vec::new(), distant: Vec::new() };
        };
        let start_u64 = start.as_u64();
        let end_u64 = end.as_u64();
        let candidate_bins = reg2bins(start_u64, end_u64.wrapping_add(1));
        let linear_min = linear_index_min(ref_idx, start_u64);
        let mut nearby = Vec::new();
        let mut distant = Vec::new();
        for bin in &ref_idx.bins {
            if bin.bin_id == PSEUDO_BIN {
                continue;
            }
            if candidate_bins.contains(&bin.bin_id) {
                let target = if bin_level(bin.bin_id) <= CACHE_LEVEL_THRESHOLD {
                    &mut distant
                } else {
                    &mut nearby
                };
                for chunk in &bin.chunks {
                    if chunk.end > linear_min {
                        target.push(*chunk);
                    }
                }
            }
        }
        nearby.sort_by_key(|c| c.begin);
        merge_overlapping_chunks(&mut nearby);
        distant.sort_by_key(|c| c.begin);
        merge_overlapping_chunks(&mut distant);
        QueryChunks { nearby, distant }
    }
}
/// Either flavor of alignment index: classic BAI or CSI.
#[derive(Debug)]
pub enum AlignmentIndex {
    Bai(BamIndex),
    Csi(CsiIndex),
}
impl AlignmentIndex {
pub fn query(&self, tid: u32, start: Pos0, end: Pos0) -> Vec<Chunk> {
match self {
Self::Bai(idx) => idx.query(tid, start, end),
Self::Csi(idx) => idx.query(tid, start, end),
}
}
pub fn query_annotated(&self, tid: u32, start: Pos0, end: Pos0) -> Vec<AnnotatedChunk> {
match self {
Self::Bai(idx) => idx.query_annotated(tid, start, end),
Self::Csi(idx) => idx.query_annotated(tid, start, end),
}
}
pub fn query_split(&self, tid: u32, start: Pos0, end: Pos0) -> QueryChunks {
match self {
Self::Bai(idx) => idx.query_split(tid, start, end),
Self::Csi(idx) => idx.query_split(tid, start, end),
}
}
}
impl From<BamIndex> for AlignmentIndex {
fn from(idx: BamIndex) -> Self {
Self::Bai(idx)
}
}
impl From<CsiIndex> for AlignmentIndex {
fn from(idx: CsiIndex) -> Self {
Self::Csi(idx)
}
}
/// Collapses a `begin`-sorted chunk list in place, merging any chunks that
/// overlap or touch (next `begin` <= previous `end`) into a single chunk.
/// Callers sort by `begin` before invoking this.
pub(super) fn merge_overlapping_chunks(chunks: &mut Vec<Chunk>) {
    let mut merged: Vec<Chunk> = Vec::with_capacity(chunks.len());
    for chunk in chunks.drain(..) {
        match merged.last_mut() {
            // Overlapping or adjacent: extend the previous chunk if needed.
            Some(last) if chunk.begin <= last.end => {
                if chunk.end > last.end {
                    last.end = chunk.end;
                }
            }
            // First chunk, or a gap before this one: start a new entry.
            _ => merged.push(chunk),
        }
    }
    *chunks = merged;
}
fn parse_refs(data: &[u8], pos: &mut usize) -> Result<BamIndex, BaiError> {
let n_ref = read_i32(data, pos)? as usize;
parse_refs_with_count(data, pos, n_ref)
}
/// Maximum number of reference sequences accepted from an index.
const MAX_INDEX_REFS: usize = 100_000;
/// Maximum number of bins accepted for a single reference.
const MAX_BINS_PER_REF: usize = 100_000;
/// Maximum number of chunks accepted for a single bin.
const MAX_CHUNKS_PER_BIN: usize = 1_000_000;
/// Maximum number of linear-index entries accepted for a single reference.
const MAX_LINEAR_INDEX: usize = 500_000;

/// Rejects `value` with [`BaiError::FieldTooLarge`] when it exceeds `limit`;
/// guards the `Vec::with_capacity` calls against hostile count fields.
fn check_index_limit(value: usize, limit: usize, field: &'static str) -> Result<(), BaiError> {
    if value <= limit {
        Ok(())
    } else {
        Err(BaiError::FieldTooLarge { field, value, limit })
    }
}
fn parse_refs_with_count(data: &[u8], pos: &mut usize, n_ref: usize) -> Result<BamIndex, BaiError> {
check_index_limit(n_ref, MAX_INDEX_REFS, "n_ref")?;
let mut references = Vec::with_capacity(n_ref);
for _ in 0..n_ref {
let n_bin = read_i32(data, pos)? as usize;
check_index_limit(n_bin, MAX_BINS_PER_REF, "n_bin")?;
let mut bins = Vec::with_capacity(n_bin);
for _ in 0..n_bin {
let bin_id = read_u32(data, pos)?;
let n_chunk = read_i32(data, pos)? as usize;
check_index_limit(n_chunk, MAX_CHUNKS_PER_BIN, "n_chunk")?;
let mut chunks = Vec::with_capacity(n_chunk);
for _ in 0..n_chunk {
let begin = VirtualOffset(read_u64(data, pos)?);
let end = VirtualOffset(read_u64(data, pos)?);
chunks.push(Chunk { begin, end });
}
bins.push(Bin { bin_id, chunks });
}
let n_intv = read_i32(data, pos)? as usize;
check_index_limit(n_intv, MAX_LINEAR_INDEX, "n_intv")?;
let mut linear_index = Vec::with_capacity(n_intv);
for _ in 0..n_intv {
linear_index.push(VirtualOffset(read_u64(data, pos)?));
}
references.push(RefIndex { bins, linear_index });
}
Ok(BamIndex { references })
}
/// Decompresses an entire BGZF file (a concatenation of gzip blocks whose
/// total size, BSIZE, is stored in the gzip extra field) into one contiguous
/// byte buffer.
///
/// # Errors
/// Returns [`BaiError::Bgzf`] wrapping the specific failure: bad gzip magic,
/// a block shorter than its minimum size, a block running past the end of the
/// input, or a deflate decompression error.
pub(super) fn decompress_bgzf_file(compressed: &[u8]) -> Result<Vec<u8>, BaiError> {
    // Rough guess at the decompressed size to limit reallocations.
    let mut result = Vec::with_capacity(compressed.len().wrapping_mul(3));
    let mut offset = 0;
    let mut decompressor = libdeflater::Decompressor::new();
    while offset < compressed.len() {
        let remaining = compressed.len().wrapping_sub(offset);
        // A block needs at least an 18-byte header; shorter trailing residue
        // is ignored rather than treated as an error.
        if remaining < 18 {
            break;
        }
        debug_assert!(remaining >= 18, "BGZF block too short: remaining={remaining}");
        #[expect(clippy::indexing_slicing, reason = "remaining >= 18 checked above")]
        let block = &compressed[offset..];
        if block.get(..2) != Some(&[0x1f, 0x8b]) {
            return Err(BaiError::Bgzf { source: BgzfError::InvalidMagic });
        }
        // BSIZE (total block size minus one) is read from bytes 16..18 — this
        // assumes the BC subfield is the first entry in the gzip extra field,
        // which holds for standard BGZF writers; TODO confirm for exotic
        // producers.
        #[expect(clippy::indexing_slicing, reason = "remaining >= 18 checked above")]
        let bsize = (u16::from_le_bytes([block[16], block[17]]) as usize).wrapping_add(1);
        if offset.wrapping_add(bsize) > compressed.len() {
            return Err(BaiError::Bgzf { source: BgzfError::TruncatedBlock });
        }
        if bsize < 18 {
            #[expect(clippy::cast_possible_truncation, reason = "bsize < 18")]
            return Err(BaiError::Bgzf {
                source: BgzfError::BlockSizeTooSmall { bsize: bsize as u16 },
            });
        }
        debug_assert!(bsize >= 18, "bsize should be >= 18 at this point: bsize={bsize}");
        // ISIZE (uncompressed payload length) sits in the last 4 bytes.
        #[expect(clippy::indexing_slicing, reason = "bsize >= 18 checked above")]
        let isize_bytes = &block[bsize.wrapping_sub(4)..bsize];
        let isize = u32::from_le_bytes(
            isize_bytes.try_into().unwrap_or_else(|_| unreachable!("bsize >= 18")),
        ) as usize;
        // An empty payload (e.g. an EOF marker block) is skipped entirely.
        if isize == 0 {
            offset = offset.wrapping_add(bsize);
            continue;
        }
        if bsize < 26 {
            #[expect(clippy::cast_possible_truncation, reason = "bsize < 26")]
            return Err(BaiError::Bgzf {
                source: BgzfError::BlockSizeTooSmall { bsize: bsize as u16 },
            });
        }
        debug_assert!(bsize >= 26, "bsize should be >= 26 at this point: bsize={bsize}");
        // The compressed payload sits between the 18-byte header and the
        // 8-byte CRC32 + ISIZE trailer.
        #[allow(clippy::indexing_slicing, reason = "bsize >= 26 checked above")]
        let cdata = &block[18..bsize.wrapping_sub(8)];
        let start = result.len();
        // NOTE(review): `isize` comes straight from the file and is not
        // bounded before this resize; a hostile input can request a very
        // large allocation — consider validating against the BGZF maximum.
        result.resize(start.wrapping_add(isize), 0);
        debug_assert!(
            start.wrapping_add(isize) <= result.len(),
            "result overrun: {} > {}",
            start.wrapping_add(isize),
            result.len()
        );
        #[allow(clippy::indexing_slicing, reason = "just resized to start + isize")]
        decompressor
            .deflate_decompress(cdata, &mut result[start..start.wrapping_add(isize)])
            .map_err(|source| BaiError::Bgzf {
                source: BgzfError::DecompressionFailed { source },
            })?;
        offset = offset.wrapping_add(bsize);
    }
    Ok(result)
}
/// Maps a BAI bin id to its depth in the binning hierarchy: 0 is the single
/// whole-reference bin, 5 is the finest level. Ids beyond the standard range
/// (including the 37450 pseudo-bin) map to 6.
pub fn bin_level(bin_id: u32) -> u8 {
    if bin_id == 0 {
        0
    } else if bin_id <= 8 {
        1
    } else if bin_id <= 72 {
        2
    } else if bin_id <= 584 {
        3
    } else if bin_id <= 4680 {
        4
    } else if bin_id <= 37449 {
        5
    } else {
        6
    }
}
/// Returns the genomic span, in bases, covered by one bin at `level`: 2^29
/// for level 0 shrinking by a factor of 8 per level down to 2^14 at level 5.
/// Unknown levels (> 5) yield 0.
pub fn bin_span(level: u8) -> u64 {
    const SPANS: [u64; 6] = [1 << 29, 1 << 26, 1 << 23, 1 << 20, 1 << 17, 1 << 14];
    SPANS.get(usize::from(level)).copied().unwrap_or(0)
}
/// Lowest virtual offset recorded for the linear-index window containing
/// `start`; falls back to offset 0 when the window lies past the index.
fn linear_index_min(ref_idx: &RefIndex, start: u64) -> VirtualOffset {
    // Linear-index windows are 16384 (2^14) positions wide.
    let window = (start >> 14) as usize;
    match ref_idx.linear_index.get(window) {
        Some(offset) => *offset,
        None => VirtualOffset(0),
    }
}
/// Computes the BAI candidate bin ids that can overlap the half-open region
/// `[beg, end)`: bin 0 plus, for each level, the run of bins spanning the
/// region at that level's resolution.
fn reg2bins(beg: u64, end: u64) -> Vec<u32> {
    // (first bin id at this level, position shift for this level).
    const LEVELS: [(u32, u32); 5] = [(1, 26), (9, 23), (73, 20), (585, 17), (4681, 14)];
    let mut bins = Vec::with_capacity(32);
    // Bin 0 spans the whole reference and is always a candidate.
    bins.push(0);
    let last_pos = end.saturating_sub(1);
    for &(offset, shift) in &LEVELS {
        #[expect(
            clippy::cast_possible_truncation,
            reason = "BAM positions are capped at 2^29; shifted by ≥14 gives values ≤ 2^15, fits in u32"
        )]
        let first = offset.wrapping_add((beg >> shift) as u32);
        #[expect(
            clippy::cast_possible_truncation,
            reason = "BAM positions are capped at 2^29; shifted by ≥14 gives values ≤ 2^15, fits in u32"
        )]
        let last = offset.wrapping_add((last_pos >> shift) as u32);
        bins.extend(first..=last);
    }
    bins
}
/// Reads a little-endian `i32` at `*pos`, advancing `*pos` by 4 on success.
///
/// # Errors
/// Returns [`BaiError::Truncated`] when fewer than 4 bytes remain.
fn read_i32(data: &[u8], pos: &mut usize) -> Result<i32, BaiError> {
    let end = pos.wrapping_add(4);
    let slice = data.get(*pos..end).ok_or(BaiError::Truncated)?;
    let bytes: [u8; 4] = slice.try_into().map_err(|_| BaiError::Truncated)?;
    *pos = end;
    Ok(i32::from_le_bytes(bytes))
}
/// Reads a little-endian `u32` at `*pos`, advancing `*pos` by 4 on success.
///
/// # Errors
/// Returns [`BaiError::Truncated`] when fewer than 4 bytes remain.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, BaiError> {
    let end = pos.wrapping_add(4);
    let slice = data.get(*pos..end).ok_or(BaiError::Truncated)?;
    let bytes: [u8; 4] = slice.try_into().map_err(|_| BaiError::Truncated)?;
    *pos = end;
    Ok(u32::from_le_bytes(bytes))
}
/// Reads a little-endian `u64` at `*pos`, advancing `*pos` by 8 on success.
///
/// # Errors
/// Returns [`BaiError::Truncated`] when fewer than 8 bytes remain.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, BaiError> {
    let end = pos.wrapping_add(8);
    let slice = data.get(*pos..end).ok_or(BaiError::Truncated)?;
    let bytes: [u8; 8] = slice.try_into().map_err(|_| BaiError::Truncated)?;
    *pos = end;
    Ok(u64::from_le_bytes(bytes))
}
// Unit and property tests for bin math, query splitting, parse errors, BGZF
// handling, and chunk merging.
#[cfg(test)]
#[allow(clippy::arithmetic_side_effects, reason = "tests")]
#[allow(clippy::cast_possible_truncation, reason = "tests")]
mod tests {
    use super::*;
    use seqair_types::Pos0;

    #[test]
    fn reg2bins_includes_bin0() {
        let bins = reg2bins(0, 100);
        assert!(bins.contains(&0));
    }

    #[test]
    fn reg2bins_small_region() {
        let bins = reg2bins(100, 200);
        assert!(bins.contains(&0));
        assert!(bins.contains(&(4681 + (100 >> 14) as u32)));
    }

    // Builds a one-reference index from (bin_id, chunks) pairs with an
    // all-zero linear index so no chunk is filtered by the linear minimum.
    fn index_with_bins(bins: Vec<(u32, Vec<Chunk>)>) -> BamIndex {
        BamIndex {
            references: vec![RefIndex {
                bins: bins.into_iter().map(|(bin_id, chunks)| Bin { bin_id, chunks }).collect(),
                linear_index: vec![VirtualOffset(0); 64],
            }],
        }
    }

    // Shorthand for a chunk between two raw virtual offsets.
    fn chunk(begin: u64, end: u64) -> Chunk {
        Chunk { begin: VirtualOffset(begin), end: VirtualOffset(end) }
    }

    #[test]
    fn query_split_separates_bin0() {
        let idx = index_with_bins(vec![
            (0, vec![chunk(9000, 9500)]),
            (4681, vec![chunk(100, 500)]),
        ]);
        let result = idx.query_split(0, Pos0::new(0).unwrap(), Pos0::new(100).unwrap());
        assert_eq!(result.distant.len(), 1, "bin 0 should have 1 chunk");
        assert_eq!(result.distant[0].begin.0, 9000);
        assert!(!result.nearby.is_empty(), "regular should have chunks");
        assert!(
            result.nearby.iter().all(|c| c.begin.0 != 9000),
            "bin 0 chunk should not appear in regular"
        );
    }

    #[test]
    fn query_split_union_equals_query() {
        let idx = index_with_bins(vec![
            (0, vec![chunk(9000, 9500)]),
            (4681, vec![chunk(100, 500)]),
            (4682, vec![chunk(500, 900)]),
        ]);
        let split = idx.query_split(0, Pos0::new(0).unwrap(), Pos0::new(200).unwrap());
        let flat = idx.query(0, Pos0::new(0).unwrap(), Pos0::new(200).unwrap());
        let mut combined: Vec<u64> =
            split.nearby.iter().chain(&split.distant).map(|c| c.begin.0).collect();
        combined.sort();
        let mut flat_begins: Vec<u64> = flat.iter().map(|c| c.begin.0).collect();
        flat_begins.sort();
        assert_eq!(combined, flat_begins, "split union must equal flat query");
    }

    #[test]
    fn query_split_no_bin0_in_index() {
        let idx = index_with_bins(vec![(4681, vec![chunk(100, 500)])]);
        let result = idx.query_split(0, Pos0::new(0).unwrap(), Pos0::new(100).unwrap());
        assert!(result.distant.is_empty(), "no bin 0 in index → empty bin0");
        assert!(!result.nearby.is_empty());
    }

    #[test]
    fn query_split_empty_reference() {
        let idx = BamIndex { references: Vec::new() };
        let result = idx.query_split(0, Pos0::new(0).unwrap(), Pos0::new(100).unwrap());
        assert!(result.nearby.is_empty());
        assert!(result.distant.is_empty());
    }

    #[test]
    fn reg2bin_at_64mb_boundary_returns_bin0() {
        // A region straddling a level-1 bin boundary fits only in bin 0.
        let boundary: u64 = 1 << 26;
        let beg = boundary - 10;
        let end = boundary + 10;
        let bins = reg2bins(beg, end);
        assert!(bins.contains(&0), "read spanning 64 MB boundary must include bin 0");
        let beg_l1 = 1 + (beg >> 26) as u32;
        let end_l1 = 1 + ((end - 1) >> 26) as u32;
        assert_ne!(beg_l1, end_l1, "beg and end should be in different level-1 bins");
    }

    #[test]
    fn reg2bin_not_at_boundary_still_has_bin0_in_candidates() {
        let bins = reg2bins(1000, 1100);
        assert!(bins.contains(&0));
    }

    #[test]
    fn from_path_invalid_magic_returns_error() {
        use std::io::Write;
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("bad.bai");
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(b"NOTBAI\x01\x00extra").unwrap();
        let err = BamIndex::from_path(&path).unwrap_err();
        assert!(matches!(err, BaiError::InvalidMagic));
    }

    #[test]
    fn from_path_too_short_returns_error() {
        use std::io::Write;
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("short.bai");
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(b"BAI").unwrap();
        let err = BamIndex::from_path(&path).unwrap_err();
        assert!(matches!(err, BaiError::InvalidMagic));
    }

    #[test]
    fn from_tabix_path_wrong_magic_returns_invalid_tabix_magic() {
        use std::io::Write;
        // Hand-built BGZF block whose decompressed payload begins with
        // 0xdeadbeef instead of the TBI magic.
        #[rustfmt::skip]
        let bgzf_block: &[u8] = &[
            0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
            0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x22, 0x00,
            0x01, 0x04, 0x00, 0xfb, 0xff, 0xde, 0xad, 0xbe, 0xef,
            0x00, 0x00, 0x00, 0x00,
            0x04, 0x00, 0x00, 0x00,
        ];
        // Standard BGZF end-of-file marker block.
        #[rustfmt::skip]
        let bgzf_eof: &[u8] = &[
            0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
            0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x1b, 0x00,
            0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        ];
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("bad_magic.tbi");
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(bgzf_block).unwrap();
        f.write_all(bgzf_eof).unwrap();
        drop(f);
        let err = BamIndex::from_tabix_path(&path).unwrap_err();
        match err {
            BaiError::InvalidTabixMagic { found } => {
                assert_eq!(found, [0xde, 0xad, 0xbe, 0xef]);
            }
            other => panic!("expected InvalidTabixMagic, got {other:?}"),
        }
    }

    #[test]
    fn decompress_bgzf_rejects_bsize_too_small() {
        // 18-byte header whose BSIZE field (16 → block size 17) is too small.
        let mut block = vec![0x1f, 0x8b, 0x08, 0x04];
        block.extend_from_slice(&[0; 4]);
        block.push(0);
        block.push(0xff);
        block.extend_from_slice(&6u16.to_le_bytes());
        block.extend_from_slice(&[b'B', b'C', 2, 0]);
        block.extend_from_slice(&16u16.to_le_bytes());
        block.resize(18, 0);
        let result = decompress_bgzf_file(&block);
        assert!(result.is_err(), "should reject bsize < 18");
        let err = result.unwrap_err();
        assert!(
            matches!(err, BaiError::Bgzf { source: BgzfError::BlockSizeTooSmall { .. } }),
            "expected BlockSizeTooSmall, got {err:?}"
        );
    }

    #[test]
    fn query_split_merges_overlapping_nearby_chunks() {
        let idx = index_with_bins(vec![
            (73, vec![chunk(500, 800)]),
            (4685, vec![chunk(700, 1000)]),
        ]);
        let result = idx.query_split(0, Pos0::new(65500).unwrap(), Pos0::new(65700).unwrap());
        assert_eq!(
            result.nearby.len(),
            1,
            "overlapping nearby chunks must be merged, got: {:?}",
            result.nearby.iter().map(|c| (c.begin.0, c.end.0)).collect::<Vec<_>>()
        );
        assert_eq!(result.nearby[0].begin.0, 500);
        assert_eq!(result.nearby[0].end.0, 1000);
    }

    #[test]
    fn query_split_merges_adjacent_nearby_chunks() {
        let idx =
            index_with_bins(vec![(73, vec![chunk(500, 700)]), (4685, vec![chunk(700, 1000)])]);
        let result = idx.query_split(0, Pos0::new(65500).unwrap(), Pos0::new(65700).unwrap());
        assert_eq!(
            result.nearby.len(),
            1,
            "adjacent chunks must be merged, got: {:?}",
            result.nearby.iter().map(|c| (c.begin.0, c.end.0)).collect::<Vec<_>>()
        );
    }

    #[test]
    fn query_split_keeps_non_overlapping_nearby_chunks_separate() {
        let idx =
            index_with_bins(vec![(73, vec![chunk(500, 700)]), (4685, vec![chunk(900, 1200)])]);
        let result = idx.query_split(0, Pos0::new(65500).unwrap(), Pos0::new(65700).unwrap());
        assert_eq!(
            result.nearby.len(),
            2,
            "non-overlapping chunks must stay separate, got: {:?}",
            result.nearby.iter().map(|c| (c.begin.0, c.end.0)).collect::<Vec<_>>()
        );
    }

    // Sorts the pairs, merges them, and returns raw (begin, end) pairs.
    fn merge(pairs: &[(u64, u64)]) -> Vec<(u64, u64)> {
        let mut chunks: Vec<Chunk> = pairs.iter().map(|&(b, e)| chunk(b, e)).collect();
        chunks.sort_by_key(|c| c.begin);
        merge_overlapping_chunks(&mut chunks);
        chunks.iter().map(|c| (c.begin.0, c.end.0)).collect()
    }

    #[test]
    fn merge_empty() {
        assert_eq!(merge(&[]), Vec::<(u64, u64)>::new());
    }

    #[test]
    fn merge_single() {
        assert_eq!(merge(&[(10, 20)]), vec![(10, 20)]);
    }

    #[test]
    fn merge_subset_chunk() {
        assert_eq!(merge(&[(10, 100), (30, 50)]), vec![(10, 100)]);
    }

    #[test]
    fn merge_chain_of_three() {
        assert_eq!(merge(&[(10, 30), (20, 50), (40, 70)]), vec![(10, 70)]);
    }

    #[test]
    fn merge_mixed_overlap_and_gap() {
        assert_eq!(
            merge(&[(10, 30), (20, 50), (100, 200), (150, 250)]),
            vec![(10, 50), (100, 250)]
        );
    }

    #[test]
    fn merge_identical_chunks() {
        assert_eq!(merge(&[(10, 20), (10, 20), (10, 20)]), vec![(10, 20)]);
    }

    use proptest::prelude::*;

    // Generates (begin, end) pairs with begin < end.
    fn chunk_strategy() -> impl Strategy<Value = (u64, u64)> {
        (0u64..10_000).prop_flat_map(|begin| (Just(begin), begin + 1..begin + 5_000))
    }

    proptest! {
        #[test]
        fn merge_output_is_sorted_and_non_overlapping(
            chunks in prop::collection::vec(chunk_strategy(), 0..20)
        ) {
            let merged = merge(&chunks);
            for w in merged.windows(2) {
                prop_assert!(w[0].0 < w[1].0, "not sorted: {:?}", merged);
            }
            for w in merged.windows(2) {
                prop_assert!(w[1].0 > w[0].1, "overlapping after merge: {:?}", merged);
            }
        }
        #[test]
        fn merge_covers_same_positions(
            chunks in prop::collection::vec(chunk_strategy(), 1..20)
        ) {
            let merged = merge(&chunks);
            for &(b, e) in &chunks {
                let mid = b + (e - b) / 2;
                prop_assert!(
                    merged.iter().any(|&(mb, me)| mb <= mid && mid <= me),
                    "midpoint {} of [{}, {}] not covered by merged {:?}",
                    mid, b, e, merged
                );
            }
        }
    }
}