use super::{reader::CramError, varint};
#[derive(Debug)]
pub struct ContainerHeader {
pub length: i32,
pub ref_seq_id: i32,
pub alignment_start: i32,
pub alignment_span: i32,
pub num_records: i32,
pub record_counter: i64,
pub bases: i64,
pub num_blocks: i32,
pub landmarks: Vec<i32>,
pub header_size: usize,
}
impl ContainerHeader {
pub fn parse(buf: &[u8]) -> Result<Self, CramError> {
let mut pos: usize = 0;
let length_end = pos
.checked_add(4)
.ok_or(CramError::Truncated { context: "container header length pos" })?;
let length_bytes = buf
.get(pos..length_end)
.ok_or(CramError::Truncated { context: "container header length" })?;
let length = i32::from_le_bytes(
length_bytes
.try_into()
.map_err(|_| CramError::Truncated { context: "container header length" })?,
);
pos = length_end;
let (ref_seq_id, n) = read_itf8_at(buf, pos, "container ref_seq_id")?;
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container ref_seq_id pos" })?;
let (alignment_start, n) = read_itf8_at(buf, pos, "container alignment_start")?;
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container alignment_start pos" })?;
let (alignment_span, n) = read_itf8_at(buf, pos, "container alignment_span")?;
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container alignment_span pos" })?;
let (num_records, n) = read_itf8_at(buf, pos, "container num_records")?;
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container num_records pos" })?;
let (record_counter_raw, n) = read_ltf8_at(buf, pos, "container record_counter")?;
let record_counter = record_counter_raw.cast_signed();
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container record_counter pos" })?;
let (bases_raw, n) = read_ltf8_at(buf, pos, "container bases")?;
let bases = bases_raw.cast_signed();
pos = pos.checked_add(n).ok_or(CramError::Truncated { context: "container bases pos" })?;
let (num_blocks, n) = read_itf8_at(buf, pos, "container num_blocks")?;
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container num_blocks pos" })?;
let (landmark_count, n) = read_itf8_at(buf, pos, "container landmark count")?;
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container landmark count pos" })?;
let landmark_count_usize = landmark_count as usize;
super::reader::check_alloc_size(
landmark_count_usize.saturating_mul(4),
"container landmark array",
)?;
let mut landmarks = Vec::with_capacity(landmark_count_usize);
for _ in 0..landmark_count {
let (landmark, n) = read_itf8_at(buf, pos, "container landmark")?;
landmarks.push(landmark.cast_signed());
pos = pos
.checked_add(n)
.ok_or(CramError::Truncated { context: "container landmark pos" })?;
}
let crc_end = pos
.checked_add(4)
.ok_or(CramError::Truncated { context: "container header CRC32 pos" })?;
let crc_bytes = buf
.get(pos..crc_end)
.ok_or(CramError::Truncated { context: "container header CRC32" })?;
let expected_crc = u32::from_le_bytes(
crc_bytes
.try_into()
.map_err(|_| CramError::Truncated { context: "container header CRC32" })?,
);
let crc_data = buf
.get(..pos)
.ok_or(CramError::Truncated { context: "container header CRC32 range" })?;
let mut crc = libdeflater::Crc::new();
crc.update(crc_data);
if crc.sum() != expected_crc {
return Err(CramError::ChecksumMismatch {
context: "container header",
expected: expected_crc,
found: crc.sum(),
});
}
pos = pos
.checked_add(4)
.ok_or(CramError::Truncated { context: "container pos after CRC" })?;
Ok(ContainerHeader {
length,
ref_seq_id: ref_seq_id.cast_signed(),
alignment_start: alignment_start.cast_signed(),
alignment_span: alignment_span.cast_signed(),
num_records: num_records.cast_signed(),
record_counter,
bases,
num_blocks: num_blocks.cast_signed(),
landmarks,
header_size: pos,
})
}
pub fn is_eof(&self) -> bool {
self.num_records == 0 && self.length <= 15
}
pub fn overlaps(&self, query_start: i64, query_end: i64) -> bool {
if self.ref_seq_id < 0 {
return self.ref_seq_id == -2;
}
let container_start = i64::from(self.alignment_start);
let container_end = container_start.saturating_add(i64::from(self.alignment_span));
container_start < query_end && container_end > query_start
}
}
fn read_itf8_at(buf: &[u8], pos: usize, context: &'static str) -> Result<(u32, usize), CramError> {
let slice = buf.get(pos..).ok_or(CramError::Truncated { context })?;
varint::decode_itf8(slice).ok_or(CramError::Truncated { context })
}
fn read_ltf8_at(buf: &[u8], pos: usize, context: &'static str) -> Result<(u64, usize), CramError> {
let slice = buf.get(pos..).ok_or(CramError::Truncated { context })?;
varint::decode_ltf8(slice).ok_or(CramError::Truncated { context })
}
#[cfg(test)]
#[allow(
clippy::arithmetic_side_effects,
clippy::indexing_slicing,
reason = "test-only arithmetic and indexing on known-valid CRAM file data"
)]
mod tests {
use super::*;
#[test]
fn parse_header_container_from_real_cram() {
let data =
std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.cram"))
.unwrap();
let after_file_def = &data[26..];
let header = ContainerHeader::parse(after_file_def).unwrap();
assert_eq!(header.num_records, 0, "header container should have 0 records");
assert!(header.length > 0, "header container should have positive length");
assert!(!header.is_eof(), "header container should not be EOF");
}
#[test]
fn parse_data_container_from_real_cram() {
let data =
std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.cram"))
.unwrap();
let after_file_def = &data[26..];
let header_container = ContainerHeader::parse(after_file_def).unwrap();
let data_start = header_container.header_size + header_container.length as usize;
let after_header = &after_file_def[data_start..];
let data_container = ContainerHeader::parse(after_header).unwrap();
assert!(data_container.num_records > 0, "data container should have records");
assert!(
data_container.ref_seq_id >= 0 || data_container.ref_seq_id == -2,
"data container should have valid ref_seq_id"
);
assert!(data_container.alignment_start > 0);
assert!(data_container.alignment_span > 0);
assert!(!data_container.landmarks.is_empty(), "should have landmarks");
}
#[test]
fn find_eof_container_in_real_cram() {
let data =
std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.cram"))
.unwrap();
let mut pos = 26;
let mut found_eof = false;
let mut container_count = 0;
while pos < data.len() {
let remaining = &data[pos..];
let Ok(container) = ContainerHeader::parse(remaining) else { break };
container_count += 1;
if container.is_eof() {
found_eof = true;
break;
}
pos += container.header_size + container.length as usize;
}
assert!(found_eof, "should find EOF container");
assert!(
container_count >= 3,
"should have at least header + data + EOF containers, found {container_count}"
);
}
#[test]
fn parse_header_block_extracts_sam_header() {
use super::super::block;
let data =
std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.cram"))
.unwrap();
let after_file_def = &data[26..];
let header_container = ContainerHeader::parse(after_file_def).unwrap();
let block_data = &after_file_def[header_container.header_size..];
let (blk, _) = block::parse_block(block_data).unwrap();
assert_eq!(blk.content_type, block::ContentType::FileHeader);
let header_len =
i32::from_le_bytes([blk.data[0], blk.data[1], blk.data[2], blk.data[3]]) as usize;
let header_text = std::str::from_utf8(&blk.data[4..4 + header_len]).unwrap();
assert!(header_text.starts_with("@HD"), "should start with @HD header line");
assert!(header_text.contains("@SQ"), "should contain @SQ sequence dictionary");
}
#[test]
fn container_header_matches_htslib() {
use super::super::block;
use crate::bam::BamHeader;
use rust_htslib::bam::Read as _;
let data =
std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.cram"))
.unwrap();
let after_file_def = &data[26..];
let header_container = ContainerHeader::parse(after_file_def).unwrap();
let block_data = &after_file_def[header_container.header_size..];
let (blk, _) = block::parse_block(block_data).unwrap();
let header_len =
i32::from_le_bytes([blk.data[0], blk.data[1], blk.data[2], blk.data[3]]) as usize;
let header_text = std::str::from_utf8(&blk.data[4..4 + header_len]).unwrap();
let header = BamHeader::from_sam_text(header_text).unwrap();
let htslib_bam = rust_htslib::bam::Reader::from_path(concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../tests/data/test.bam"
))
.unwrap();
let htslib_header = htslib_bam.header();
let our_names: Vec<&str> = header.target_names().collect();
let htslib_names = htslib_header.target_names();
assert_eq!(our_names.len(), htslib_names.len(), "target count mismatch");
for (ours, theirs) in our_names.iter().zip(htslib_names.iter()) {
let theirs_str = std::str::from_utf8(theirs).unwrap();
assert_eq!(*ours, theirs_str, "target name mismatch");
}
}
#[test]
fn overlaps_query() {
let header = ContainerHeader {
length: 0,
ref_seq_id: 0,
alignment_start: 100,
alignment_span: 100,
num_records: 10,
record_counter: 0,
bases: 1000,
num_blocks: 1,
landmarks: vec![],
header_size: 0,
};
assert!(header.overlaps(50, 150)); assert!(header.overlaps(150, 250)); assert!(header.overlaps(100, 200)); assert!(header.overlaps(120, 180)); assert!(header.overlaps(50, 250));
assert!(!header.overlaps(200, 300)); assert!(!header.overlaps(0, 100)); assert!(!header.overlaps(300, 400)); }
}