use std::ops::Neg;
use super::{
block::{self, Block, ContentType},
compression_header::CompressionHeader,
encoding::{DecodeContext, ExternalCursor},
reader::CramError,
varint,
};
use crate::bam::{
BamHeader,
cigar::CigarOp,
record_store::{CustomizeRecordStore, RecordStore},
};
use seqair_types::{BamFlags, Base, Pos0, Pos1, SmallVec, SmolStr};
use tracing::warn;
/// Per-record positional bookkeeping collected while decoding a slice, used
/// afterwards by `resolve_mate_tlen` to fill in template lengths and mate
/// coordinates for mates stored within the same slice.
struct SliceMateInfo {
    // 0-based alignment start (-1 for records that were not position-decoded,
    // e.g. unmapped records).
    pos: i64,
    // Exclusive 0-based alignment end (`pos` + reference span consumed).
    end_pos: i64,
    // Slice-local index of the next fragment in the template, or -1 when this
    // record has no downstream mate recorded in the slice.
    mate_line: i32,
    // Index assigned by the record store, or `None` when the record was not
    // kept (foreign reference, outside the query window, or dropped).
    store_idx: Option<u32>,
    // Raw BAM flag word as decoded from the BF data series.
    bam_flags: u16,
    // Reference sequence id this record aligns to.
    ref_id: i32,
}
/// Parsed CRAM slice header (the mandatory first block of every slice).
/// Fields appear in on-disk order.
#[derive(Debug)]
pub struct SliceHeader {
    // -2 marks a multi-reference slice (records then carry their own ref id).
    pub ref_seq_id: i32,
    // 1-based alignment start of the slice on the reference.
    pub alignment_start: i32,
    // Number of reference bases the slice spans.
    pub alignment_span: i32,
    pub num_records: i32,
    pub record_counter: i64,
    pub num_blocks: i32,
    pub block_content_ids: Vec<i32>,
    // Content id of an embedded reference block; negative when absent.
    pub embedded_reference: i32,
    // MD5 of the uppercased reference span; all zeros when not recorded.
    pub reference_md5: [u8; 16],
}
impl SliceHeader {
    /// Parses a CRAM slice header from the (already decompressed) block
    /// payload `data`.
    ///
    /// # Errors
    /// Returns `CramError::Truncated` when `data` ends mid-field and
    /// `CramError::InvalidLength` when the content-ID count is negative.
    pub fn parse(data: &[u8]) -> Result<Self, CramError> {
        let mut buf: &[u8] = data;
        // Fixed-order ITF8/LTF8 fields, per the CRAM slice header layout.
        let ref_seq_id = read_itf8(&mut buf)?.cast_signed();
        let alignment_start = read_itf8(&mut buf)?.cast_signed();
        let alignment_span = read_itf8(&mut buf)?.cast_signed();
        let num_records = read_itf8(&mut buf)?.cast_signed();
        let record_counter = read_ltf8(&mut buf)?.cast_signed();
        let num_blocks = read_itf8(&mut buf)?.cast_signed();
        // Reject a negative count before it can turn into a huge usize.
        let raw_id_count = read_itf8(&mut buf)?;
        let id_count = usize::try_from(raw_id_count.cast_signed())
            .map_err(|_| CramError::InvalidLength { value: raw_id_count.cast_signed() })?;
        // Guard the upcoming reservation against absurd sizes.
        super::reader::check_alloc_size(id_count.saturating_mul(4), "slice content IDs")?;
        let mut block_content_ids = Vec::with_capacity(id_count);
        for _ in 0..id_count {
            block_content_ids.push(read_itf8(&mut buf)?.cast_signed());
        }
        let embedded_reference = read_itf8(&mut buf)?.cast_signed();
        // The final 16 bytes are the reference MD5 checksum.
        let md5_src =
            buf.get(..16).ok_or(CramError::Truncated { context: "slice reference MD5" })?;
        let mut reference_md5 = [0u8; 16];
        reference_md5.copy_from_slice(md5_src);
        Ok(SliceHeader {
            ref_seq_id,
            alignment_start,
            alignment_span,
            num_records,
            record_counter,
            num_blocks,
            block_content_ids,
            embedded_reference,
            reference_md5,
        })
    }
}
/// Decodes one CRAM slice located at byte `slice_offset` within
/// `container_data`, pushing overlapping records into `store`.
///
/// Pipeline: parse the slice header block, optionally verify the reference
/// MD5, collect the core and external data blocks, decode every record via
/// [`decode_record`], then back-fill mate template lengths.
///
/// Returns `(fetched, kept)`: `fetched` counts records that overlapped the
/// `query_start..=query_end` window on `tid`; `kept` is the subset the store
/// actually retained (`push_fields` may decline via `customize`).
#[expect(
    clippy::too_many_arguments,
    reason = "CRAM slice decoding requires all compression header, data, and offset parameters"
)]
pub(crate) fn decode_slice<E: CustomizeRecordStore>(
    ch: &CompressionHeader,
    container_data: &[u8],
    slice_offset: usize,
    reference_seq: &[u8],
    ref_start_0based: i64,
    header: &BamHeader,
    read_group_ids: &[SmolStr],
    tid: u32,
    query_start: Pos0,
    query_end: Pos0,
    store: &mut RecordStore<E::Extra>,
    // The Vec parameters below are scratch buffers reused across slices to
    // avoid per-record allocation.
    cigar_buf: &mut Vec<CigarOp>,
    bases_buf: &mut Vec<Base>,
    qual_buf: &mut Vec<u8>,
    aux_buf: &mut Vec<u8>,
    name_buf: &mut Vec<u8>,
    feature_byte_buf: &mut Vec<u8>,
    cigar_ops_buf: &mut Vec<(u32, u8)>,
    customize: &mut E,
    rans_4x8_buf: &mut Option<super::rans::Rans4x8Buf>,
    nx16_order1_buf: &mut Option<super::rans_nx16::Nx16Order1Buf>,
) -> Result<(usize, usize), CramError> {
    let slice_data = container_data
        .get(slice_offset..)
        .ok_or(CramError::Truncated { context: "slice offset" })?;
    // The first block of a slice must be the slice header.
    let (slice_header_block, mut pos) =
        block::parse_block_with_buf(slice_data, rans_4x8_buf.as_mut(), nx16_order1_buf.as_mut())?;
    if slice_header_block.content_type != ContentType::SliceHeader {
        return Err(CramError::ExpectedSliceHeader { found: slice_header_block.content_type });
    }
    let sh = SliceHeader::parse(&slice_header_block.data)?;
    // ref_seq_id == -2: records carry individual reference ids (RI series).
    let is_multi_ref = sh.ref_seq_id == -2;
    if !ch.preservation.read_names_included {
        warn!(
            "CRAM file has RN=false (read names not stored). Mate-pair overlap dedup will be unreliable."
        );
    }
    if sh.num_records == 0 {
        return Ok((0, 0));
    }
    // Verify the reference MD5 unless the slice is multi-ref, the checksum is
    // absent (all zeros), or no reference was supplied.
    if !is_multi_ref && sh.reference_md5 != [0u8; 16] && !reference_seq.is_empty() {
        let slice_start_0based = Pos1::try_from(sh.alignment_start.max(1))
            .map_err(|_| CramError::InvalidPosition { value: i64::from(sh.alignment_start) })?
            .to_zero_based()
            .as_i64();
        // Offset of the slice within the caller-provided reference window.
        let slice_ref_start_i64 = slice_start_0based.wrapping_sub(ref_start_0based);
        let slice_ref_start = usize::try_from(slice_ref_start_i64)
            .map_err(|_| CramError::InvalidPosition { value: slice_ref_start_i64 })?;
        let span = usize::try_from(sh.alignment_span)
            .map_err(|_| CramError::InvalidLength { value: sh.alignment_span })?;
        let slice_ref_end = slice_ref_start.wrapping_add(span);
        // Only checksum when the window fully covers the slice span; a
        // partial window cannot be validated.
        if let Some(slice_ref) = reference_seq.get(slice_ref_start..slice_ref_end) {
            let computed = md5_uppercase_streaming(slice_ref);
            if computed != sh.reference_md5 {
                return Err(CramError::ReferenceMd5Mismatch {
                    contig: header.target_name(tid).unwrap_or("?").into(),
                    start: sh.alignment_start as u64,
                    end: sh.alignment_start.wrapping_add(sh.alignment_span) as u64,
                });
            }
        }
    }
    // Gather the slice's data blocks: one core block plus any number of
    // external blocks keyed by content id.
    let mut core_block: Option<Block> = None;
    let mut external_blocks: SmallVec<(i32, ExternalCursor), 4> = SmallVec::new();
    for _ in 0..sh.num_blocks {
        let remaining =
            slice_data.get(pos..).ok_or(CramError::Truncated { context: "slice block" })?;
        let (blk, consumed) = block::parse_block_with_buf(
            remaining,
            rans_4x8_buf.as_mut(),
            nx16_order1_buf.as_mut(),
        )?;
        pos = pos.wrapping_add(consumed);
        match blk.content_type {
            ContentType::CoreData => {
                core_block = Some(blk);
            }
            ContentType::ExternalData => {
                external_blocks.push((blk.content_id, ExternalCursor::new(blk.data)));
            }
            _ => {
                // Other block types are not needed for record decoding.
            }
        }
    }
    // An embedded reference (if present) lives in one of the external blocks
    // and replaces the caller-supplied reference for this slice.
    let embedded_ref_data = if sh.embedded_reference >= 0 {
        let idx = external_blocks.iter().position(|(id, _)| *id == sh.embedded_reference);
        match idx {
            Some(i) => external_blocks.swap_remove(i).1.into_data(),
            None => Vec::new(),
        }
    } else {
        Vec::new()
    };
    let effective_ref: &[u8] =
        if sh.embedded_reference >= 0 { &embedded_ref_data } else { reference_seq };
    let core_data = core_block.ok_or(CramError::MissingCoreDataBlock)?;
    let mut ctx = DecodeContext::new(&core_data.data, external_blocks);
    // Running alignment position used for AP-delta decoding (1-based).
    let mut alignment_pos = i64::from(sh.alignment_start);
    let mut fetched_count = 0usize;
    let mut kept_count = 0usize;
    let num_records = usize::try_from(sh.num_records)
        .map_err(|_| CramError::InvalidLength { value: sh.num_records })?;
    let mut mate_infos: Vec<SliceMateInfo> = Vec::with_capacity(num_records);
    for record_index in 0..num_records {
        // An embedded reference begins at the slice's own alignment start
        // rather than at the caller's window start.
        let effective_ref_start = if sh.embedded_reference >= 0 {
            Pos1::try_from(sh.alignment_start.max(1))
                .map_err(|_| CramError::InvalidPosition { value: i64::from(sh.alignment_start) })?
                .to_zero_based()
                .as_i64()
        } else {
            ref_start_0based
        };
        let (fetched, mate_info) = decode_record(
            ch,
            &sh,
            is_multi_ref,
            &mut ctx,
            &mut alignment_pos,
            effective_ref,
            effective_ref_start,
            read_group_ids,
            tid,
            query_start,
            query_end,
            store,
            cigar_buf,
            bases_buf,
            qual_buf,
            aux_buf,
            name_buf,
            feature_byte_buf,
            cigar_ops_buf,
            record_index,
            customize,
        )?;
        if fetched {
            fetched_count = fetched_count.wrapping_add(1);
            if mate_info.store_idx.is_some() {
                kept_count = kept_count.wrapping_add(1);
            }
        }
        mate_infos.push(mate_info);
    }
    // Back-fill TLEN and mate coordinates for mates linked within this slice.
    resolve_mate_tlen(&mate_infos, store);
    Ok((fetched_count, kept_count))
}
/// Decodes a single CRAM record from the slice's core/external streams.
///
/// Every data series is decoded in a fixed order regardless of whether the
/// record will be kept, so the streams stay in sync for subsequent records.
///
/// Returns `(fetched, mate_info)`: `fetched` is `true` only when the record
/// is mapped on the queried reference, overlaps the query window, and was
/// offered to the store; `mate_info` carries the positional data consumed
/// later by `resolve_mate_tlen` (`store_idx == None` means not stored).
#[expect(
    clippy::too_many_arguments,
    reason = "CRAM record decoding requires compression header, slice header, context, and customize parameters"
)]
fn decode_record<E: CustomizeRecordStore>(
    ch: &CompressionHeader,
    sh: &SliceHeader,
    is_multi_ref: bool,
    ctx: &mut DecodeContext<'_>,
    prev_alignment_pos: &mut i64,
    reference_seq: &[u8],
    ref_start_0based: i64,
    read_group_ids: &[SmolStr],
    tid: u32,
    query_start: Pos0,
    query_end: Pos0,
    store: &mut RecordStore<E::Extra>,
    cigar_buf: &mut Vec<CigarOp>,
    bases_buf: &mut Vec<Base>,
    qual_buf: &mut Vec<u8>,
    aux_buf: &mut Vec<u8>,
    name_buf: &mut Vec<u8>,
    feature_byte_buf: &mut Vec<u8>,
    cigar_ops_buf: &mut Vec<(u32, u8)>,
    record_index: usize,
    customize: &mut E,
) -> Result<(bool, SliceMateInfo), CramError> {
    let ds = &ch.data_series;
    // BAM flag word (BF series); must fit in u16.
    let raw_flags = ds.bam_flags.decode(ctx)?;
    let raw_flags: u16 = u16::try_from(raw_flags)
        .map_err(|source| CramError::InvalidBamFlags { value: raw_flags, source })?;
    let bam_flags = BamFlags::from(raw_flags);
    #[expect(
        clippy::cast_sign_loss,
        reason = "CRAM flags are small non-negative integers encoded as ITF8 i32"
    )]
    let cram_flags = ds.cram_flags.decode(ctx)? as u32;
    // CRAM-specific bit flags (CF series).
    let quality_as_array = cram_flags & 0x1 != 0;
    let detached = cram_flags & 0x2 != 0;
    let mate_downstream = cram_flags & 0x4 != 0;
    let seq_unknown = cram_flags & 0x8 != 0;
    // Multi-reference slices store a per-record reference id (RI series).
    let record_ref_id = if is_multi_ref { ds.ref_id.decode(ctx)? } else { sh.ref_seq_id };
    let read_length_i32 = ds.read_length.decode(ctx)?;
    let read_length = usize::try_from(read_length_i32)
        .map_err(|_| CramError::InvalidLength { value: read_length_i32 })?;
    // Alignment position (AP series): either delta against the previous
    // record or an absolute position, depending on the AP-delta flag.
    let ap = i64::from(ds.alignment_pos.decode(ctx)?);
    let alignment_pos = if ch.preservation.ap_delta {
        *prev_alignment_pos = prev_alignment_pos.wrapping_add(ap);
        *prev_alignment_pos
    } else {
        *prev_alignment_pos = ap;
        ap
    };
    let pos_0based = Pos1::try_from(alignment_pos)
        .map(|p| p.to_zero_based())
        .map_err(|_| super::reader::CramError::InvalidPosition { value: alignment_pos })?;
    let read_group = ds.read_group.decode(ctx)?;
    name_buf.clear();
    if ch.preservation.read_names_included {
        ds.read_name.decode_into(ctx, name_buf)?;
    }
    // Mate fields, filled only for detached records (mate not in this slice).
    let mut next_pos_val: i32 = -1;
    let mut next_ref_id_val: i32 = -1;
    let mut template_len_val: i32 = 0;
    let mut mate_line: i32 = -1;
    if detached {
        let _mate_flags = ds.mate_flags.decode(ctx)?;
        if !ch.preservation.read_names_included {
            // With RN=false the name is stored alongside the mate data;
            // decode to keep the stream in sync, then discard.
            ds.read_name.decode_into(ctx, name_buf)?;
            name_buf.clear();
        }
        next_ref_id_val = ds.next_segment_ref.decode(ctx)?;
        // Stored 1-based; convert to 0-based.
        next_pos_val = ds.next_mate_pos.decode(ctx)?.saturating_sub(1);
        template_len_val = ds.template_size.decode(ctx)?;
    } else if mate_downstream {
        // NF series: distance (in records) to the mate within this slice.
        let nf = ds.next_fragment.decode(ctx)?;
        #[expect(
            clippy::cast_possible_wrap,
            clippy::cast_possible_truncation,
            reason = "record_index is bounded by num_records (i32), fits in i32"
        )]
        {
            mate_line = (record_index as i32).wrapping_add(1).wrapping_add(nf);
        }
    }
    // Which line of the tag dictionary lists this record's aux tags.
    let tag_line_idx_i32 = ds.tag_line.decode(ctx)?;
    let tag_line_idx = usize::try_from(tag_line_idx_i32)
        .map_err(|_| CramError::InvalidLength { value: tag_line_idx_i32 })?;
    aux_buf.clear();
    if let Some(tag_set) = ch.preservation.tag_dictionary.get(tag_line_idx) {
        for entry in tag_set {
            // Tag encodings are keyed by the packed (tag0, tag1, type) triple.
            let tag_key = (i32::from(entry.tag[0]) << 16)
                | (i32::from(entry.tag[1]) << 8)
                | i32::from(entry.bam_type);
            if let Some(enc) = ch.tag_encodings.get(&tag_key) {
                aux_buf.push(entry.tag[0]);
                aux_buf.push(entry.tag[1]);
                aux_buf.push(entry.bam_type);
                enc.decode_into(ctx, aux_buf)?;
            }
        }
    }
    // Append an RG:Z aux tag when the record references a known read group.
    if let Ok(rg_idx) = usize::try_from(read_group)
        && let Some(rg_id) = read_group_ids.get(rg_idx)
    {
        aux_buf.extend_from_slice(b"RGZ");
        aux_buf.extend_from_slice(rg_id.as_bytes());
        // Z-type aux values are NUL-terminated.
        aux_buf.push(0);
    }
    let is_unmapped = bam_flags.is_unmapped();
    cigar_buf.clear();
    bases_buf.clear();
    qual_buf.clear();
    bases_buf.reserve(read_length);
    qual_buf.reserve(read_length);
    if !is_unmapped {
        // Mapped path: reconstruct sequence + CIGAR from the feature list.
        let feature_count_i32 = ds.feature_count.decode(ctx)?;
        let feature_count = usize::try_from(feature_count_i32)
            .map_err(|_| CramError::InvalidLength { value: feature_count_i32 })?;
        #[expect(
            clippy::cast_possible_wrap,
            reason = "tid comes from BAM header, capped at MAX_REFERENCES (1M), well within i32"
        )]
        let foreign_tid = is_multi_ref && record_ref_id != tid as i32;
        let (ref_consumed, matching_bases, indel_bases) = if foreign_tid {
            // Record maps to a different reference than the query: drain its
            // streams without reconstruction, keeping only the ref span.
            let span =
                scan_features_for_refspan(ch, ctx, feature_count, read_length, feature_byte_buf)?;
            (span, 0, 0)
        } else {
            let result = decode_features_and_reconstruct(
                ch,
                ctx,
                feature_count,
                read_length,
                pos_0based.as_i64(),
                ref_start_0based,
                reference_seq,
                cigar_buf,
                bases_buf,
                feature_byte_buf,
                cigar_ops_buf,
            )?;
            (result.ref_consumed, result.matching_bases, result.indel_bases)
        };
        #[expect(
            clippy::cast_possible_truncation,
            clippy::cast_sign_loss,
            reason = "mapping quality is 0..=255 encoded as ITF8 i32; only lower 8 bits are valid"
        )]
        let mapq = ds.mapping_quality.decode(ctx)? as u8;
        if quality_as_array {
            if foreign_tid {
                // Qualities must still be consumed to stay in sync; decode
                // into scratch and discard.
                feature_byte_buf.clear();
                ds.quality_score.decode_n_into(ctx, read_length, feature_byte_buf)?;
            } else {
                ds.quality_score.decode_n_into(ctx, read_length, qual_buf)?;
            }
        } else if !foreign_tid {
            // No stored qualities: fill with 0xFF ("missing").
            qual_buf.resize(read_length, 0xFF);
        }
        // Exclusive end; inclusive end clamps to start for zero ref span.
        let end_pos_raw = pos_0based.as_i64().wrapping_add(i64::from(ref_consumed));
        let end_pos_inclusive_raw =
            if ref_consumed == 0 { pos_0based.as_i64() } else { end_pos_raw.wrapping_sub(1) };
        let end_pos = Pos0::try_from(end_pos_inclusive_raw).map_err(|_| {
            super::reader::CramError::InvalidPosition { value: end_pos_inclusive_raw }
        })?;
        if foreign_tid {
            return Ok((
                false,
                SliceMateInfo {
                    pos: pos_0based.as_i64(),
                    end_pos: end_pos_raw,
                    mate_line,
                    store_idx: None,
                    bam_flags: raw_flags,
                    ref_id: record_ref_id,
                },
            ));
        }
        // Skip records entirely outside the query window (positions are
        // still recorded for mate resolution).
        if pos_0based > query_end || end_pos < query_start {
            return Ok((
                false,
                SliceMateInfo {
                    pos: pos_0based.as_i64(),
                    end_pos: end_pos_raw,
                    mate_line,
                    store_idx: None,
                    bam_flags: raw_flags,
                    ref_id: record_ref_id,
                },
            ));
        }
        let qname: &[u8] = name_buf;
        // Offer the record to the store; `push_fields` may decline (None).
        let store_idx = store.push_fields(
            pos_0based,
            end_pos,
            bam_flags,
            mapq,
            matching_bases,
            indel_bases,
            qname,
            cigar_buf,
            bases_buf,
            qual_buf,
            aux_buf,
            record_ref_id,
            next_ref_id_val,
            next_pos_val,
            template_len_val,
            customize,
        )?;
        return Ok((
            true,
            SliceMateInfo {
                pos: pos_0based.as_i64(),
                end_pos: end_pos_raw,
                mate_line,
                store_idx,
                bam_flags: raw_flags,
                ref_id: record_ref_id,
            },
        ));
    }
    // Unmapped path: no features; bases stored verbatim unless unknown.
    if !seq_unknown {
        for _ in 0..read_length {
            let base_byte = ds.base.decode(ctx)?;
            bases_buf.push(Base::from(base_byte));
        }
    }
    if quality_as_array {
        ds.quality_score.decode_n_into(ctx, read_length, qual_buf)?;
    } else {
        qual_buf.resize(read_length, 0xFF);
    }
    // Unmapped records are never stored here; positions marked invalid.
    Ok((
        false,
        SliceMateInfo {
            pos: -1,
            end_pos: -1,
            mate_line: -1,
            store_idx: None,
            bam_flags: raw_flags,
            ref_id: record_ref_id,
        },
    ))
}
/// Walks mate chains recorded during slice decoding (linked through
/// `SliceMateInfo::mate_line`) and back-fills template length (TLEN) plus
/// mate reference/position for every stored record in each chain.
///
/// TLEN magnitude is the distance from the leftmost alignment start to the
/// rightmost alignment end across the chain; the record at the leftmost
/// start gets the positive value, the rest the negated one, with BAM flag
/// 0x40 breaking exact ties (presumably "first segment" — per BAM flag
/// conventions).
#[expect(
    clippy::indexing_slicing,
    clippy::arithmetic_side_effects,
    reason = "everything is based on infos.len()"
)]
fn resolve_mate_tlen<U>(infos: &[SliceMateInfo], store: &mut RecordStore<U>) {
    let n = infos.len();
    // Marks records already assigned to a chain so each is visited once.
    let mut resolved = vec![false; n];
    for i in 0..n {
        if resolved[i] || infos[i].mate_line < 0 {
            continue;
        }
        // Follow mate_line links to collect the whole template chain.
        let mut chain: Vec<usize> = Vec::with_capacity(4);
        let mut idx = i;
        loop {
            if idx >= n || resolved[idx] {
                break;
            }
            chain.push(idx);
            resolved[idx] = true;
            let ml = infos[idx].mate_line;
            if ml < 0 || ml as usize >= n {
                break;
            }
            idx = ml as usize;
            // Defensive cycle guard; `resolved` should already prevent loops.
            if chain.len() > n {
                break;
            }
        }
        if chain.len() < 2 {
            continue;
        }
        // Template extent: leftmost start and rightmost (exclusive) end,
        // counting how many records sit exactly on each extreme so ties can
        // be detected below.
        let mut aleft = i64::MAX;
        let mut aright = i64::MIN;
        let mut left_cnt = 0u32;
        let mut right_cnt = 0u32;
        for &j in &chain {
            let info = &infos[j];
            if info.pos < 0 {
                // Unmapped members contribute no coordinates.
                continue;
            }
            if info.pos < aleft {
                aleft = info.pos;
                left_cnt = 1;
            } else if info.pos == aleft {
                left_cnt = left_cnt.wrapping_add(1);
            }
            if info.end_pos > aright {
                aright = info.end_pos;
                right_cnt = 1;
            } else if info.end_pos == aright {
                right_cnt = right_cnt.wrapping_add(1);
            }
        }
        if aright <= aleft {
            // Degenerate extent (e.g. all members unmapped); leave TLEN as-is.
            continue;
        }
        #[expect(
            clippy::cast_possible_truncation,
            reason = "template length fits in i32 for genomic data"
        )]
        let tlen_magnitude = (aright.wrapping_sub(aleft)) as i32;
        // Sign rule: positive for the leftmost record, negative otherwise;
        // full ties fall back to the 0x40 flag.
        let first = &infos[chain[0]];
        let first_tlen = if first.pos == aleft && (first.end_pos < aright || left_cnt <= 1) {
            tlen_magnitude
        } else if first.pos == aleft && first.end_pos == aright && left_cnt > 1 && right_cnt > 1 {
            if first.bam_flags & 0x40 != 0 { tlen_magnitude } else { tlen_magnitude.neg() }
        } else {
            tlen_magnitude.neg()
        };
        if let Some(idx) = first.store_idx {
            store.set_template_len(idx, first_tlen);
        }
        // All remaining chain members get the opposite sign.
        let rest_tlen = first_tlen.neg();
        for &j in &chain[1..] {
            if let Some(idx) = infos[j].store_idx {
                store.set_template_len(idx, rest_tlen);
            }
        }
        // Mate pointers: each member points at the next, last wraps to first.
        for ci in 0..chain.len() {
            let mate_ci = if ci + 1 < chain.len() { ci + 1 } else { 0 };
            let record_idx = infos[chain[ci]].store_idx;
            let mate = &infos[chain[mate_ci]];
            let Some(idx) = record_idx else { continue };
            if mate.store_idx.is_none() {
                // Mate was filtered out; mark its coordinates unknown.
                store.set_mate_info(idx, -1, -1);
                continue;
            }
            #[expect(
                clippy::cast_possible_truncation,
                reason = "mate positions fit in i32 for genomic data"
            )]
            let mate_pos = if mate.pos < 0 { -1 } else { mate.pos as i32 };
            store.set_mate_info(idx, mate.ref_id, mate_pos);
        }
    }
}
/// Summary of a mapped record's sequence/CIGAR reconstruction.
struct ReconstructResult {
    // Reference bases consumed by the alignment (match, deletion, skip).
    ref_consumed: u32,
    // Bases emitted as alignment matches (CIGAR op 0 / M).
    matching_bases: u32,
    // Bases counted for insertions and deletions (CIGAR ops I and D).
    indel_bases: u32,
}
/// Returns the reference base at `index`, substituting `b'N'` when `index`
/// is beyond the end of `reference_seq`. The first out-of-bounds access
/// emits a warning and sets `*warned` so later accesses stay silent.
fn ref_base_at(reference_seq: &[u8], index: usize, warned: &mut bool) -> u8 {
    if let Some(&base) = reference_seq.get(index) {
        return base;
    }
    if !*warned {
        *warned = true;
        warn!(
            index,
            ref_len = reference_seq.len(),
            "reference shorter than expected during CRAM sequence reconstruction; \
             substituting N (may indicate reference/CRAM mismatch)"
        );
    }
    b'N'
}
/// Decodes a mapped record's feature list and reconstructs its bases and
/// CIGAR against the reference.
///
/// Feature positions (FP) are delta-encoded 1-based read offsets; the gaps
/// between consecutive features are filled with reference-matching bases,
/// as is any tail after the last feature.
#[expect(
    clippy::too_many_arguments,
    reason = "sequence reconstruction requires all CRAM encoding handles, reference slice, and output buffers"
)]
fn decode_features_and_reconstruct(
    ch: &CompressionHeader,
    ctx: &mut DecodeContext<'_>,
    feature_count: usize,
    read_length: usize,
    pos_0based: i64,
    slice_start_0based: i64,
    reference_seq: &[u8],
    cigar_buf: &mut Vec<CigarOp>,
    bases_buf: &mut Vec<Base>,
    feature_byte_buf: &mut Vec<u8>,
    cigar_ops_buf: &mut Vec<(u32, u8)>,
) -> Result<ReconstructResult, CramError> {
    let ds = &ch.data_series;
    // Record's offset into the in-memory reference window (clamped to 0 when
    // the record starts before the window).
    let ref_offset = usize::try_from(pos_0based.wrapping_sub(slice_start_0based)).unwrap_or(0);
    let mut read_pos = 0usize;
    let mut ref_pos = 0usize;
    let mut ref_warned = false;
    let mut matching_bases = 0u32;
    let mut indel_bases = 0u32;
    cigar_ops_buf.clear();
    // Appends a (len, op) pair, merging with the previous entry when the op
    // matches. Op codes follow BAM: 0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P.
    fn push_cigar_op(ops: &mut Vec<(u32, u8)>, len: u32, op: u8) {
        if len == 0 {
            return;
        }
        if let Some(last) = ops.last_mut()
            && last.1 == op
        {
            last.0 = last.0.saturating_add(len);
            return;
        }
        ops.push((len, op));
    }
    // Emits one base copied from the reference as a CIGAR match, advancing
    // both the read and the reference cursors.
    #[expect(clippy::too_many_arguments, reason = "shared helper for ref-match emit")]
    fn emit_ref_match(
        reference_seq: &[u8],
        ref_offset: usize,
        ref_pos: &mut usize,
        read_pos: &mut usize,
        ref_warned: &mut bool,
        bases_buf: &mut Vec<Base>,
        cigar_ops: &mut Vec<(u32, u8)>,
        matching_bases: &mut u32,
    ) {
        let ref_base = ref_base_at(reference_seq, ref_offset.wrapping_add(*ref_pos), ref_warned);
        bases_buf.push(Base::from(ref_base));
        push_cigar_op(cigar_ops, 1, 0);
        *matching_bases = matching_bases.saturating_add(1);
        *read_pos = read_pos.wrapping_add(1);
        *ref_pos = ref_pos.wrapping_add(1);
    }
    // Cumulative 1-based read position of the current feature.
    let mut feature_read_pos = 0u32;
    for _ in 0..feature_count {
        let fc = ds.feature_code.decode(ctx)?;
        let fp = ds.feature_pos.decode(ctx)? as u32;
        feature_read_pos = feature_read_pos.wrapping_add(fp);
        let feat_target = (feature_read_pos as usize).saturating_sub(1);
        // Fill the gap before this feature with reference matches.
        while read_pos < feat_target && read_pos < read_length {
            emit_ref_match(
                reference_seq,
                ref_offset,
                &mut ref_pos,
                &mut read_pos,
                &mut ref_warned,
                bases_buf,
                cigar_ops_buf,
                &mut matching_bases,
            );
        }
        match fc {
            // Base substitution: a code into the substitution matrix applied
            // to the reference base; still emitted as a CIGAR match.
            b'X' => {
                let bs = ds.base_sub.decode(ctx)?;
                let ref_base =
                    ref_base_at(reference_seq, ref_offset.wrapping_add(ref_pos), &mut ref_warned);
                let read_base = ch.preservation.substitution_matrix.substitute(ref_base, bs);
                bases_buf.push(Base::from(read_base));
                push_cigar_op(cigar_ops_buf, 1, 0);
                matching_bases = matching_bases.saturating_add(1);
                read_pos = read_pos.wrapping_add(1);
                ref_pos = ref_pos.wrapping_add(1);
            }
            // Multi-base insertion (consumes read only).
            b'I' => {
                feature_byte_buf.clear();
                ds.insertion.decode_into(ctx, feature_byte_buf)?;
                let len = feature_byte_buf.len();
                bases_buf.reserve(len);
                for &b in feature_byte_buf.iter() {
                    bases_buf.push(Base::from(b));
                }
                #[expect(
                    clippy::cast_possible_truncation,
                    reason = "insertion length is bounded by read_length (validated); fits in u32"
                )]
                let len_u32 = len as u32;
                push_cigar_op(cigar_ops_buf, len_u32, 1);
                indel_bases = indel_bases.saturating_add(len_u32);
                read_pos = read_pos.wrapping_add(len);
            }
            // Single-base insertion.
            b'i' => {
                let base = ds.base.decode(ctx)?;
                bases_buf.push(Base::from(base));
                push_cigar_op(cigar_ops_buf, 1, 1);
                indel_bases = indel_bases.saturating_add(1);
                read_pos = read_pos.wrapping_add(1);
            }
            // Deletion (consumes reference only).
            b'D' => {
                let len = ds.deletion_length.decode(ctx)? as u32;
                push_cigar_op(cigar_ops_buf, len, 2);
                indel_bases = indel_bases.saturating_add(len);
                ref_pos = ref_pos.wrapping_add(len as usize);
            }
            // Soft clip: read bases present but not aligned.
            b'S' => {
                feature_byte_buf.clear();
                ds.soft_clip.decode_into(ctx, feature_byte_buf)?;
                let len = feature_byte_buf.len();
                bases_buf.reserve(len);
                for &b in feature_byte_buf.iter() {
                    bases_buf.push(Base::from(b));
                }
                #[expect(
                    clippy::cast_possible_truncation,
                    reason = "soft-clip length is bounded by read_length (validated); fits in u32"
                )]
                let len_u32 = len as u32;
                push_cigar_op(cigar_ops_buf, len_u32, 4);
                read_pos = read_pos.wrapping_add(len);
            }
            // Hard clip: CIGAR only; no bases, no cursor movement.
            b'H' => {
                let len = ds.hard_clip.decode(ctx)? as u32;
                push_cigar_op(cigar_ops_buf, len, 5);
            }
            // Reference skip (consumes reference only).
            b'N' => {
                let len = ds.ref_skip.decode(ctx)? as u32;
                push_cigar_op(cigar_ops_buf, len, 3);
                ref_pos = ref_pos.wrapping_add(len as usize);
            }
            // Padding: CIGAR only.
            b'P' => {
                let len = ds.padding.decode(ctx)? as u32;
                push_cigar_op(cigar_ops_buf, len, 6);
            }
            // Explicit read base with its quality (quality discarded here);
            // emitted as a match.
            b'B' => {
                let base = ds.base.decode(ctx)?;
                let _qual = ds.quality_score.decode(ctx)?;
                bases_buf.push(Base::from(base));
                push_cigar_op(cigar_ops_buf, 1, 0);
                matching_bases = matching_bases.saturating_add(1);
                read_pos = read_pos.wrapping_add(1);
                ref_pos = ref_pos.wrapping_add(1);
            }
            // Stretch of explicit read bases; emitted as matches.
            b'b' => {
                feature_byte_buf.clear();
                ds.bases_block.decode_into(ctx, feature_byte_buf)?;
                let len = feature_byte_buf.len();
                bases_buf.reserve(len);
                for &b in feature_byte_buf.iter() {
                    bases_buf.push(Base::from(b));
                }
                #[expect(
                    clippy::cast_possible_truncation,
                    reason = "bases length is bounded by read_length (validated); fits in u32"
                )]
                let len_u32 = len as u32;
                push_cigar_op(cigar_ops_buf, len_u32, 0);
                matching_bases = matching_bases.saturating_add(len_u32);
                read_pos = read_pos.wrapping_add(len);
                ref_pos = ref_pos.wrapping_add(len);
            }
            // Quality block: qualities decoded into scratch and not kept; the
            // base itself is taken from the reference.
            b'q' => {
                feature_byte_buf.clear();
                ds.quality_block.decode_into(ctx, feature_byte_buf)?;
                emit_ref_match(
                    reference_seq,
                    ref_offset,
                    &mut ref_pos,
                    &mut read_pos,
                    &mut ref_warned,
                    bases_buf,
                    cigar_ops_buf,
                    &mut matching_bases,
                );
            }
            // Single quality score: decoded and discarded, base from reference.
            b'Q' => {
                let _qual = ds.quality_score.decode(ctx)?;
                emit_ref_match(
                    reference_seq,
                    ref_offset,
                    &mut ref_pos,
                    &mut read_pos,
                    &mut ref_warned,
                    bases_buf,
                    cigar_ops_buf,
                    &mut matching_bases,
                );
            }
            _ => return Err(CramError::UnknownFeatureCode { feature_code: fc }),
        }
    }
    // Remainder of the read after the last feature matches the reference.
    while read_pos < read_length {
        emit_ref_match(
            reference_seq,
            ref_offset,
            &mut ref_pos,
            &mut read_pos,
            &mut ref_warned,
            bases_buf,
            cigar_ops_buf,
            &mut matching_bases,
        );
    }
    // Convert the merged (len, op) pairs into BAM-encoded CIGAR ops.
    cigar_buf.clear();
    cigar_buf.reserve(cigar_ops_buf.len());
    for &(len, op) in cigar_ops_buf.iter() {
        cigar_buf.push(CigarOp::from_bam_u32((len << 4) | u32::from(op)));
    }
    #[expect(
        clippy::cast_possible_truncation,
        reason = "ref_pos is bounded by reference sequence length which fits in u32; BAM positions are capped at 2^29"
    )]
    Ok(ReconstructResult { ref_consumed: ref_pos as u32, matching_bases, indel_bases })
}
/// Consumes a record's feature streams without reconstructing the sequence,
/// returning only the reference span the alignment covers.
///
/// Used for records in a multi-reference slice that map to a reference other
/// than the queried one: their streams must still be drained so subsequent
/// records decode correctly, and their span is needed for mate bookkeeping.
/// The cursor arithmetic must mirror `decode_features_and_reconstruct`.
fn scan_features_for_refspan(
    ch: &CompressionHeader,
    ctx: &mut DecodeContext<'_>,
    feature_count: usize,
    read_length: usize,
    feature_byte_buf: &mut Vec<u8>,
) -> Result<u32, CramError> {
    let ds = &ch.data_series;
    let mut read_pos = 0usize;
    let mut ref_pos = 0usize;
    // Cumulative 1-based read position of the current feature (FP deltas).
    let mut feature_read_pos = 0u32;
    for _ in 0..feature_count {
        let fc = ds.feature_code.decode(ctx)?;
        let fp = ds.feature_pos.decode(ctx)? as u32;
        feature_read_pos = feature_read_pos.wrapping_add(fp);
        let feat_target = (feature_read_pos as usize).saturating_sub(1);
        // Gap before this feature counts as reference-matching bases.
        let fill = feat_target.min(read_length).saturating_sub(read_pos);
        read_pos = read_pos.wrapping_add(fill);
        ref_pos = ref_pos.wrapping_add(fill);
        // Same feature codes as decode_features_and_reconstruct, but only
        // the read/reference cursor movement is tracked.
        match fc {
            b'X' => {
                let _bs = ds.base_sub.decode(ctx)?;
                read_pos = read_pos.wrapping_add(1);
                ref_pos = ref_pos.wrapping_add(1);
            }
            b'I' => {
                feature_byte_buf.clear();
                ds.insertion.decode_into(ctx, feature_byte_buf)?;
                read_pos = read_pos.wrapping_add(feature_byte_buf.len());
            }
            b'i' => {
                let _base = ds.base.decode(ctx)?;
                read_pos = read_pos.wrapping_add(1);
            }
            b'D' => {
                let len = ds.deletion_length.decode(ctx)? as u32;
                ref_pos = ref_pos.wrapping_add(len as usize);
            }
            b'S' => {
                feature_byte_buf.clear();
                ds.soft_clip.decode_into(ctx, feature_byte_buf)?;
                read_pos = read_pos.wrapping_add(feature_byte_buf.len());
            }
            b'H' => {
                let _len = ds.hard_clip.decode(ctx)?;
            }
            b'N' => {
                let len = ds.ref_skip.decode(ctx)? as u32;
                ref_pos = ref_pos.wrapping_add(len as usize);
            }
            b'P' => {
                let _len = ds.padding.decode(ctx)?;
            }
            b'B' => {
                let _base = ds.base.decode(ctx)?;
                let _qual = ds.quality_score.decode(ctx)?;
                read_pos = read_pos.wrapping_add(1);
                ref_pos = ref_pos.wrapping_add(1);
            }
            b'b' => {
                feature_byte_buf.clear();
                ds.bases_block.decode_into(ctx, feature_byte_buf)?;
                let len = feature_byte_buf.len();
                read_pos = read_pos.wrapping_add(len);
                ref_pos = ref_pos.wrapping_add(len);
            }
            b'q' => {
                feature_byte_buf.clear();
                ds.quality_block.decode_into(ctx, feature_byte_buf)?;
                read_pos = read_pos.wrapping_add(1);
                ref_pos = ref_pos.wrapping_add(1);
            }
            b'Q' => {
                let _qual = ds.quality_score.decode(ctx)?;
                read_pos = read_pos.wrapping_add(1);
                ref_pos = ref_pos.wrapping_add(1);
            }
            _ => return Err(CramError::UnknownFeatureCode { feature_code: fc }),
        }
    }
    // Tail after the last feature also matches the reference.
    let tail = read_length.saturating_sub(read_pos);
    ref_pos = ref_pos.wrapping_add(tail);
    #[expect(
        clippy::cast_possible_truncation,
        reason = "ref_pos bounded by reference length which fits in u32; BAM positions are capped at 2^29"
    )]
    Ok(ref_pos as u32)
}
/// Reads one ITF8-encoded integer, advancing `cursor` past the bytes
/// consumed; fails with `Truncated` when the input ends mid-value.
fn read_itf8(cursor: &mut &[u8]) -> Result<u32, CramError> {
    match varint::read_itf8_from(cursor) {
        Some(value) => Ok(value),
        None => Err(CramError::Truncated { context: "slice header itf8" }),
    }
}
/// MD5 of `bytes` with every byte uppercased first, computed in fixed-size
/// chunks so no full-length uppercase copy is ever allocated.
fn md5_uppercase_streaming(bytes: &[u8]) -> [u8; 16] {
    const CHUNK_LEN: usize = 4096;
    let mut hasher = md5::Context::new();
    let mut scratch = [0u8; CHUNK_LEN];
    for piece in bytes.chunks(CHUNK_LEN) {
        let upper =
            scratch.get_mut(..piece.len()).expect("chunks(N) yields chunks of length ≤ N");
        for (dst, &src) in upper.iter_mut().zip(piece) {
            *dst = src.to_ascii_uppercase();
        }
        hasher.consume(&*upper);
    }
    hasher.finalize().into()
}
/// Reads one LTF8-encoded integer, advancing `cursor` past the bytes
/// consumed; fails with `Truncated` when the input ends mid-value.
fn read_ltf8(cursor: &mut &[u8]) -> Result<u64, CramError> {
    match varint::read_ltf8_from(cursor) {
        Some(value) => Ok(value),
        None => Err(CramError::Truncated { context: "slice header ltf8" }),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;
    // Loads the checked-in CRAM fixture used by the file-level tests.
    fn load_test_cram() -> Vec<u8> {
        std::fs::read(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.cram")).unwrap()
    }
    // Reference implementation: uppercase into a fresh buffer, then hash.
    fn md5_uppercase_alloc(bytes: &[u8]) -> [u8; 16] {
        let upper: Vec<u8> = bytes.iter().map(|b| b.to_ascii_uppercase()).collect();
        md5::compute(&upper).into()
    }
    proptest! {
        // The streaming MD5 must agree with the allocate-then-hash reference
        // for arbitrary inputs, including sizes crossing the 4096-byte chunk.
        #[test]
        fn md5_uppercase_streaming_matches_alloc(
            bytes in proptest::collection::vec(0u8..=255, 0..16_384),
        ) {
            prop_assert_eq!(md5_uppercase_streaming(&bytes), md5_uppercase_alloc(&bytes));
        }
    }
    // Spot checks: empty input, lowercase input, and mixed-case input.
    #[test]
    fn md5_uppercase_streaming_known_vectors() {
        let empty_md5: [u8; 16] = md5::compute(b"").into();
        assert_eq!(md5_uppercase_streaming(b""), empty_md5);
        let acgt_md5: [u8; 16] = md5::compute(b"ACGT").into();
        assert_eq!(md5_uppercase_streaming(b"acgt"), acgt_md5);
        let mixed_md5: [u8; 16] = md5::compute(b"ACGTNACGT").into();
        assert_eq!(md5_uppercase_streaming(b"AcGtNaCgT"), mixed_md5);
    }
    // Walks the fixture down to its first slice header and sanity-checks it.
    #[test]
    fn parse_slice_header_from_real_cram() {
        use super::super::container::ContainerHeader;
        let data = load_test_cram();
        // Skip the 26-byte CRAM file definition at the start of the file.
        let after_file_def = &data[26..];
        // First container carries the SAM header; the next is data.
        let hdr_container = ContainerHeader::parse(after_file_def).unwrap();
        let data_start = hdr_container.header_size + hdr_container.length as usize;
        let dc_bytes = &after_file_def[data_start..];
        let dc = ContainerHeader::parse(dc_bytes).unwrap();
        let container_data = &dc_bytes[dc.header_size..];
        let (comp_block, _) = block::parse_block(container_data).unwrap();
        assert_eq!(comp_block.content_type, ContentType::CompressionHeader);
        // Landmarks give slice offsets within the container payload.
        let first_landmark = *dc.landmarks.first().unwrap();
        let slice_data = &container_data[first_landmark as usize..];
        let (slice_header_block, _) = block::parse_block(slice_data).unwrap();
        assert_eq!(slice_header_block.content_type, ContentType::SliceHeader);
        let sh = SliceHeader::parse(&slice_header_block.data).unwrap();
        assert!(sh.num_records > 0, "slice should have records");
        assert!(sh.alignment_start > 0, "slice should have valid alignment start");
        assert!(sh.num_blocks > 0, "slice should have blocks");
    }
    // decode_slice must reject a reference that cannot match the stored MD5.
    #[test]
    fn md5_mismatch_detected_with_wrong_reference() {
        use super::super::{compression_header::CompressionHeader, container::ContainerHeader};
        let data = load_test_cram();
        let after_file_def = &data[26..];
        let hdr = ContainerHeader::parse(after_file_def).unwrap();
        let data_start = hdr.header_size + hdr.length as usize;
        let dc_bytes = &after_file_def[data_start..];
        let dc = ContainerHeader::parse(dc_bytes).unwrap();
        let container_data = &dc_bytes[dc.header_size..];
        let (comp_block, _) = block::parse_block(container_data).unwrap();
        let ch = CompressionHeader::parse(&comp_block.data).unwrap();
        let bam_header =
            crate::bam::BamHeader::from_sam_text("@HD\tVN:1.6\n@SQ\tSN:chr19\tLN:61431566\n")
                .unwrap();
        // An all-N reference cannot produce the slice's recorded checksum.
        let fake_ref = vec![b'N'; 100_000];
        let ref_start = Pos1::try_from(dc.alignment_start.max(1))
            .map(|p| p.to_zero_based().as_i64())
            .unwrap_or(0);
        let first_landmark = *dc.landmarks.first().unwrap();
        let result = decode_slice(
            &ch,
            container_data,
            first_landmark as usize,
            &fake_ref,
            ref_start,
            &bam_header,
            &[],
            0,
            Pos0::new(0).unwrap(),
            Pos0::max_value(),
            &mut crate::bam::record_store::RecordStore::new(),
            &mut Vec::new(),
            &mut Vec::new(),
            &mut Vec::new(),
            &mut Vec::new(),
            &mut Vec::new(),
            &mut Vec::new(),
            &mut Vec::new(),
            &mut (),
            &mut None,
            &mut None,
        );
        assert!(
            matches!(result, Err(CramError::ReferenceMd5Mismatch { .. })),
            "should detect MD5 mismatch, got: {result:?}"
        );
    }
    // A non-SliceHeader block (type 4 = ExternalData) yields the right error.
    #[test]
    fn expected_slice_header_wrong_block_type() {
        let data = b"\x01\x02\x03";
        let block_bytes = block::build_test_block(4, 0, data);
        let (blk, _) = block::parse_block(&block_bytes).unwrap();
        assert_eq!(blk.content_type, ContentType::ExternalData);
        let err = CramError::ExpectedSliceHeader { found: blk.content_type };
        assert!(matches!(err, CramError::ExpectedSliceHeader { found: ContentType::ExternalData }));
    }
    // Display output of MissingCoreDataBlock names the missing block.
    #[test]
    fn missing_core_data_block_error_variant() {
        let err = CramError::MissingCoreDataBlock;
        let msg = format!("{err}");
        assert!(
            msg.contains("core data block"),
            "error message should mention core data block: {msg}"
        );
    }
    // Display output of UnknownFeatureCode includes the offending code.
    #[test]
    fn unknown_feature_code_error_variant() {
        let err = CramError::UnknownFeatureCode { feature_code: 0xFF };
        assert!(matches!(err, CramError::UnknownFeatureCode { feature_code: 0xFF }));
        let msg = format!("{err}");
        assert!(
            msg.contains("ff") || msg.contains("0xff") || msg.contains("255"),
            "error message should contain the code: {msg}"
        );
    }
    // Out-of-bounds reference reads substitute N and warn exactly once.
    #[test]
    fn ref_base_at_warns_on_out_of_bounds() {
        let reference = b"ACGT";
        let mut warned = false;
        let base = ref_base_at(reference, 0, &mut warned);
        assert_eq!(base, b'A');
        assert!(!warned, "should not warn for in-bounds access");
        let base = ref_base_at(reference, 10, &mut warned);
        assert_eq!(base, b'N');
        assert!(warned, "should warn for out-of-bounds access");
        // Once warned, further out-of-bounds reads still substitute N.
        let base = ref_base_at(reference, 20, &mut warned);
        assert_eq!(base, b'N');
    }
    // A negative content-ID count must be rejected, not allocated.
    #[test]
    fn negative_itf8_num_content_ids_rejected() {
        // Six single-byte varint fields (ref id, start, span, records,
        // counter, blocks) followed by a 5-byte ITF8 encoding of -1 as the
        // content-ID count.
        let mut data = vec![
            0x00, 0x01, 0x01, 0x00, 0x00, 0x00,
        ];
        data.extend_from_slice(&[0xFF, 0xFF, 0xFF, 0xFF, 0xFF]);
        let result = SliceHeader::parse(&data);
        assert!(
            matches!(result, Err(CramError::InvalidLength { value: -1 })),
            "expected InvalidLength error for negative num_content_ids, got: {result:?}"
        );
    }
}