use super::aux_data::{AuxData, AuxDataError};
use super::cigar::CigarOp;
use super::seq;
use crate::io::reg2bin;
use seqair_types::{BamFlags, Base, BaseQuality, Pos0};
use thiserror::Error;
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum OwnedRecordError {
#[error("qname length {len} exceeds maximum of 254 bytes")]
QnameTooLong { len: usize },
#[error("CIGAR op count {count} exceeds maximum of 65535")]
CigarCountOverflow { count: usize },
#[error("sequence length {len} exceeds i32::MAX")]
SeqLengthOverflow { len: usize },
#[error("seq length {seq_len} != qual length {qual_len}")]
SeqQualLengthMismatch { seq_len: usize, qual_len: usize },
#[error("CIGAR query length {cigar_query_len} != seq length {seq_len}")]
CigarSeqLengthMismatch { cigar_query_len: u64, seq_len: usize },
#[error("record size {size} exceeds limit")]
RecordTooLarge { size: usize },
#[error("aux data error")]
AuxData {
#[from]
source: AuxDataError,
},
}
#[inline]
fn pos_to_bam_i32(p: Option<Pos0>) -> i32 {
p.map_or(-1, Pos0::as_i32)
}
#[derive(Debug, Clone)]
pub struct OwnedBamRecord {
pub ref_id: i32,
pub pos: Option<Pos0>,
pub mapq: u8,
pub flags: BamFlags,
pub next_ref_id: i32,
pub next_pos: Option<Pos0>,
pub template_len: i32,
pub qname: Vec<u8>,
pub cigar: Vec<CigarOp>,
pub seq: Vec<Base>,
pub qual: Vec<BaseQuality>,
pub aux: AuxData,
}
pub struct OwnedBamRecordBuilder {
ref_id: i32,
pos: Option<Pos0>,
mapq: u8,
flags: BamFlags,
next_ref_id: i32,
next_pos: Option<Pos0>,
template_len: i32,
qname: Vec<u8>,
cigar: Vec<CigarOp>,
seq: Vec<Base>,
qual: Vec<BaseQuality>,
aux: AuxData,
}
impl OwnedBamRecordBuilder {
pub fn flags(mut self, flags: BamFlags) -> Self {
self.flags = flags;
self
}
pub fn mapq(mut self, mapq: u8) -> Self {
self.mapq = mapq;
self
}
pub fn cigar(mut self, cigar: Vec<CigarOp>) -> Self {
self.cigar = cigar;
self
}
pub fn seq(mut self, seq: Vec<Base>) -> Self {
self.seq = seq;
self
}
pub fn qual(mut self, qual: Vec<BaseQuality>) -> Self {
self.qual = qual;
self
}
pub fn next_ref_id(mut self, v: i32) -> Self {
self.next_ref_id = v;
self
}
pub fn next_pos(mut self, v: Option<Pos0>) -> Self {
self.next_pos = v;
self
}
pub fn template_len(mut self, v: i32) -> Self {
self.template_len = v;
self
}
pub fn aux(mut self, aux: AuxData) -> Self {
self.aux = aux;
self
}
pub fn build(self) -> Result<OwnedBamRecord, OwnedRecordError> {
if self.qname.len() > 254 {
return Err(OwnedRecordError::QnameTooLong { len: self.qname.len() });
}
if self.cigar.len() > 65535 {
return Err(OwnedRecordError::CigarCountOverflow { count: self.cigar.len() });
}
if self.seq.len() > i32::MAX as usize {
return Err(OwnedRecordError::SeqLengthOverflow { len: self.seq.len() });
}
if !self.qual.is_empty() && self.qual.len() != self.seq.len() {
return Err(OwnedRecordError::SeqQualLengthMismatch {
seq_len: self.seq.len(),
qual_len: self.qual.len(),
});
}
Ok(OwnedBamRecord {
ref_id: self.ref_id,
pos: self.pos,
mapq: self.mapq,
flags: self.flags,
next_ref_id: self.next_ref_id,
next_pos: self.next_pos,
template_len: self.template_len,
qname: self.qname,
cigar: self.cigar,
seq: self.seq,
qual: self.qual,
aux: self.aux,
})
}
}
impl OwnedBamRecord {
pub fn builder(ref_id: i32, pos: Option<Pos0>, qname: Vec<u8>) -> OwnedBamRecordBuilder {
OwnedBamRecordBuilder {
ref_id,
pos,
mapq: 0,
flags: BamFlags::empty(),
next_ref_id: -1,
next_pos: None,
template_len: 0,
qname,
cigar: Vec::new(),
seq: Vec::new(),
qual: Vec::new(),
aux: AuxData::new(),
}
}
pub fn set_flags(&mut self, flags: BamFlags) {
self.flags = flags;
}
pub fn is_unmapped(&self) -> bool {
self.flags.is_unmapped()
}
pub fn is_reverse(&self) -> bool {
self.flags.is_reverse()
}
pub fn end_pos(&self) -> Option<Pos0> {
let pos = self.pos?;
let ref_len: u64 =
self.cigar.iter().filter(|op| op.consumes_ref()).map(|op| u64::from(op.len())).sum();
let end = pos.as_u64().saturating_add(ref_len);
let clamped = end.min(Pos0::max_value().as_u64());
#[expect(
clippy::cast_possible_truncation,
reason = "clamped <= i32::MAX < u32::MAX; fits in u32"
)]
Pos0::new(clamped as u32)
}
pub fn bin(&self) -> u16 {
let beg = self.pos.map(Pos0::as_u64).unwrap_or(0);
let end = self.end_pos().map_or(beg.saturating_add(1), |p| {
let v = p.as_u64();
if v <= beg { beg.saturating_add(1) } else { v }
});
#[expect(
clippy::cast_possible_truncation,
reason = "BAI bin numbers with min_shift=14, depth=5 are bounded by 37449, fits in u16"
)]
let bin = reg2bin(beg, end, 14, 5) as u16;
bin
}
pub fn cigar_query_len(&self) -> u64 {
self.cigar.iter().filter(|op| op.consumes_query()).map(|op| u64::from(op.len())).sum()
}
pub fn set_alignment(
&mut self,
pos: Option<Pos0>,
cigar: Vec<CigarOp>,
) -> Result<(), OwnedRecordError> {
if cigar.len() > 65535 {
return Err(OwnedRecordError::CigarCountOverflow { count: cigar.len() });
}
if !self.is_unmapped() && !self.seq.is_empty() {
let query_len: u64 =
cigar.iter().filter(|op| op.consumes_query()).map(|op| u64::from(op.len())).sum();
if query_len != self.seq.len() as u64 {
return Err(OwnedRecordError::CigarSeqLengthMismatch {
cigar_query_len: query_len,
seq_len: self.seq.len(),
});
}
}
self.pos = pos;
self.cigar = cigar;
Ok(())
}
pub fn set_seq(&mut self, seq: Vec<Base>) -> Result<(), OwnedRecordError> {
if !self.is_unmapped() && !self.cigar.is_empty() {
let query_len = self.cigar_query_len();
if query_len != seq.len() as u64 {
return Err(OwnedRecordError::CigarSeqLengthMismatch {
cigar_query_len: query_len,
seq_len: seq.len(),
});
}
}
self.seq = seq;
Ok(())
}
pub fn seq_mut(&mut self) -> &mut [Base] {
&mut self.seq
}
pub fn set_qual(&mut self, qual: Vec<BaseQuality>) -> Result<(), OwnedRecordError> {
if !qual.is_empty() && qual.len() != self.seq.len() {
return Err(OwnedRecordError::SeqQualLengthMismatch {
seq_len: self.seq.len(),
qual_len: qual.len(),
});
}
self.qual = qual;
Ok(())
}
pub fn to_bam_bytes(&self, buf: &mut Vec<u8>) -> Result<(), OwnedRecordError> {
if self.qname.len() > 254 {
return Err(OwnedRecordError::QnameTooLong { len: self.qname.len() });
}
if self.cigar.len() > 65535 {
return Err(OwnedRecordError::CigarCountOverflow { count: self.cigar.len() });
}
if self.seq.len() > i32::MAX as usize {
return Err(OwnedRecordError::SeqLengthOverflow { len: self.seq.len() });
}
#[expect(
clippy::cast_possible_truncation,
reason = "qname.len() ≤ 254 (validated above); +1 fits in u8"
)]
let l_read_name = (self.qname.len() as u8).saturating_add(1); #[expect(
clippy::cast_possible_truncation,
reason = "cigar.len() ≤ 65535 (validated above); fits in u16"
)]
let n_cigar_op = self.cigar.len() as u16;
#[expect(
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
reason = "seq.len() ≤ i32::MAX (validated above); fits in i32"
)]
let l_seq = self.seq.len() as i32;
super::record::encode_fixed_header(
buf,
&super::record::FixedHeaderFields {
ref_id: self.ref_id,
pos: pos_to_bam_i32(self.pos),
bin: self.bin(),
mapq: self.mapq,
l_read_name,
flags: self.flags.raw(),
n_cigar_op,
l_seq,
next_ref_id: self.next_ref_id,
next_pos: pos_to_bam_i32(self.next_pos),
template_len: self.template_len,
},
);
buf.extend_from_slice(&self.qname);
buf.push(0);
for op in &self.cigar {
buf.extend_from_slice(&op.to_bam_u32().to_le_bytes());
}
if !self.seq.is_empty() {
let seq_bytes: Vec<u8> = self.seq.iter().map(|b| *b as u8).collect();
let encoded = seq::encode_seq(&seq_bytes);
buf.extend_from_slice(&encoded);
}
if self.qual.is_empty() {
buf.resize(buf.len().saturating_add(self.seq.len()), 0xFF);
} else {
buf.extend_from_slice(BaseQuality::slice_to_bytes(&self.qual));
}
buf.extend_from_slice(self.aux.as_bytes());
Ok(())
}
}
#[cfg(test)]
#[allow(clippy::arithmetic_side_effects, reason = "test arithmetic on known small values")]
mod tests {
use super::super::cigar::CigarOpType;
use super::*;
use seqair_types::Pos0;
fn simple_record() -> OwnedBamRecord {
OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"read1".to_vec())
.flags(BamFlags::empty()) .mapq(30)
.cigar(vec![CigarOp::new(CigarOpType::Match, 5)])
.seq(vec![Base::A, Base::C, Base::G, Base::T, Base::A])
.qual([30, 31, 32, 33, 34].map(BaseQuality::from_byte).to_vec())
.build()
.unwrap()
}
use super::super::test_util::decode_into_store;
#[test]
fn serialize_and_decode_roundtrip() {
let rec = simple_record();
let mut buf = Vec::new();
rec.to_bam_bytes(&mut buf).unwrap();
let store = decode_into_store(&buf);
let decoded = store.record(0);
assert_eq!(decoded.tid, 0);
assert_eq!(*decoded.pos, 100);
assert_eq!(decoded.mapq, 30);
assert_eq!(decoded.flags, BamFlags::empty());
assert_eq!(decoded.seq_len, 5);
assert_eq!(decoded.n_cigar_ops, 1);
assert_eq!(store.qname(0), b"read1");
}
#[test]
fn serialize_with_aux_tags() {
let mut aux = AuxData::new();
aux.set_int(*b"NM", 3).unwrap();
aux.set_string(*b"RG", b"grp1");
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"r1".to_vec())
.cigar(vec![CigarOp::new(CigarOpType::Match, 3)])
.seq(vec![Base::A, Base::C, Base::G])
.qual([30, 31, 32].map(BaseQuality::from_byte).to_vec())
.aux(aux)
.build()
.unwrap();
let mut buf = Vec::new();
rec.to_bam_bytes(&mut buf).unwrap();
let store = decode_into_store(&buf);
let aux = store.aux(0);
let nm = super::super::aux::find_tag(aux, *b"NM");
assert_eq!(nm, Some(super::super::aux::AuxValue::U8(3)));
let rg = super::super::aux::find_tag(aux, *b"RG");
assert_eq!(rg, Some(super::super::aux::AuxValue::String(b"grp1")));
}
#[test]
fn bin_recomputed_after_modification() {
let mut rec = simple_record(); let bin_before = rec.bin();
rec.set_alignment(
Some(Pos0::new(10_000_000).unwrap()),
vec![CigarOp::new(CigarOpType::Match, 5)],
)
.unwrap();
let bin_after = rec.bin();
assert_ne!(bin_before, bin_after, "bin must change when pos changes");
let mut buf = Vec::new();
rec.to_bam_bytes(&mut buf).unwrap();
let bin_mq_nl = u32::from_le_bytes([buf[8], buf[9], buf[10], buf[11]]);
let serialized_bin = (bin_mq_nl >> 16) as u16;
assert_eq!(serialized_bin, bin_after, "serialized bin must match current bin()");
}
#[test]
fn end_pos_simple() {
let rec = simple_record();
assert_eq!(rec.end_pos(), Some(Pos0::new(105).unwrap())); }
#[test]
fn end_pos_with_insertion() {
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"r".to_vec())
.cigar(vec![
CigarOp::new(CigarOpType::Match, 3),
CigarOp::new(CigarOpType::Insertion, 2),
CigarOp::new(CigarOpType::Match, 3),
])
.seq(vec![Base::A; 8]) .qual(vec![BaseQuality::from_byte(30); 8])
.build()
.unwrap();
assert_eq!(rec.end_pos(), Some(Pos0::new(106).unwrap()));
}
#[test]
fn qname_too_long_rejected() {
let long_name = vec![b'A'; 255];
let result = OwnedBamRecord::builder(0, None, long_name).build();
assert!(result.is_err());
}
#[test]
fn qname_at_limit_accepted() {
let name = vec![b'A'; 254];
let result = OwnedBamRecord::builder(0, None, name).build();
assert!(result.is_ok());
}
#[test]
fn set_alignment_validates_cigar_seq_mismatch() {
let mut rec = simple_record(); let result = rec.set_alignment(
Some(Pos0::new(200).unwrap()),
vec![CigarOp::new(CigarOpType::Match, 10)], );
assert!(result.is_err());
}
#[test]
fn set_alignment_succeeds_when_matching() {
let mut rec = simple_record(); let result = rec.set_alignment(
Some(Pos0::new(200).unwrap()),
vec![CigarOp::new(CigarOpType::Match, 5)],
);
assert!(result.is_ok());
assert_eq!(rec.pos, Some(Pos0::new(200).unwrap()));
assert_eq!(rec.end_pos(), Some(Pos0::new(205).unwrap()));
}
#[test]
fn empty_qual_fills_with_0xff() {
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"r".to_vec())
.cigar(vec![CigarOp::new(CigarOpType::Match, 3)])
.seq(vec![Base::A, Base::C, Base::G])
.build()
.unwrap();
let mut buf = Vec::new();
rec.to_bam_bytes(&mut buf).unwrap();
let store = decode_into_store(&buf);
assert!(store.qual(0).iter().all(|&q| q == BaseQuality::UNAVAILABLE));
}
#[test]
fn unmapped_record_serializes() {
let rec = OwnedBamRecord::builder(-1, None, b"unmapped".to_vec())
.flags(BamFlags::from(0x4))
.seq(vec![Base::A, Base::C, Base::G])
.build()
.unwrap();
let mut buf = Vec::new();
rec.to_bam_bytes(&mut buf).unwrap();
let ref_id = i32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]);
let pos = i32::from_le_bytes([buf[4], buf[5], buf[6], buf[7]]);
assert_eq!(ref_id, -1);
assert_eq!(pos, -1);
}
#[test]
fn placed_unmapped_record_roundtrips() {
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(500).unwrap()), b"placed".to_vec())
.flags(BamFlags::from(0x4))
.seq(vec![Base::A, Base::C, Base::G])
.build()
.unwrap();
let mut buf = Vec::new();
rec.to_bam_bytes(&mut buf).unwrap();
let store = decode_into_store(&buf);
let decoded = store.record(0);
assert_eq!(decoded.n_cigar_ops, 0);
assert_eq!(decoded.seq_len, 3);
assert_eq!(decoded.flags & BamFlags::from(0x4), BamFlags::from(0x4));
}
#[test]
fn aligned_pairs_simple_match() {
use super::super::aligned_pairs::{AlignedPair, MatchKind};
let rec = simple_record(); let pairs: Vec<_> = rec.aligned_pairs().unwrap().collect();
assert_eq!(pairs.len(), 5);
assert_eq!(
pairs[0],
AlignedPair::Match { qpos: 0, rpos: Pos0::new(100).unwrap(), kind: MatchKind::Match }
);
assert_eq!(
pairs[4],
AlignedPair::Match { qpos: 4, rpos: Pos0::new(104).unwrap(), kind: MatchKind::Match }
);
}
#[test]
fn aligned_pairs_with_insertion() {
use super::super::aligned_pairs::{AlignedPair, MatchKind};
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"r".to_vec())
.cigar(vec![
CigarOp::new(CigarOpType::Match, 2),
CigarOp::new(CigarOpType::Insertion, 1),
CigarOp::new(CigarOpType::Match, 2),
])
.seq(vec![Base::A; 5])
.qual(vec![BaseQuality::from_byte(30); 5])
.build()
.unwrap();
let pairs: Vec<_> = rec.aligned_pairs().unwrap().collect();
let m = |q, r| AlignedPair::Match {
qpos: q,
rpos: Pos0::new(r).unwrap(),
kind: MatchKind::Match,
};
assert_eq!(pairs.len(), 5);
assert_eq!(pairs[0], m(0, 100));
assert_eq!(pairs[1], m(1, 101));
assert_eq!(pairs[2], AlignedPair::Insertion { qpos: 2, insert_len: 1 });
assert_eq!(pairs[3], m(3, 102));
assert_eq!(pairs[4], m(4, 103));
}
#[test]
fn aligned_pairs_with_deletion() {
use super::super::aligned_pairs::{AlignedPair, MatchKind};
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"r".to_vec())
.cigar(vec![
CigarOp::new(CigarOpType::Match, 2),
CigarOp::new(CigarOpType::Deletion, 3),
CigarOp::new(CigarOpType::Match, 2),
])
.seq(vec![Base::A; 4])
.qual(vec![BaseQuality::from_byte(30); 4])
.build()
.unwrap();
let pairs: Vec<_> = rec.aligned_pairs().unwrap().collect();
let m = |q, r| AlignedPair::Match {
qpos: q,
rpos: Pos0::new(r).unwrap(),
kind: MatchKind::Match,
};
assert_eq!(pairs.len(), 5); assert_eq!(pairs[0], m(0, 100));
assert_eq!(pairs[1], m(1, 101));
assert_eq!(pairs[2], AlignedPair::Deletion { rpos: Pos0::new(102).unwrap(), del_len: 3 });
assert_eq!(pairs[3], m(2, 105));
assert_eq!(pairs[4], m(3, 106));
}
#[test]
fn aligned_pairs_with_soft_clip() {
use super::super::aligned_pairs::{AlignedPair, MatchKind};
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(100).unwrap()), b"r".to_vec())
.cigar(vec![
CigarOp::new(CigarOpType::SoftClip, 2),
CigarOp::new(CigarOpType::Match, 3),
])
.seq(vec![Base::A; 5])
.qual(vec![BaseQuality::from_byte(30); 5])
.build()
.unwrap();
let pairs: Vec<_> = rec.aligned_pairs().unwrap().collect();
let m = |q, r| AlignedPair::Match {
qpos: q,
rpos: Pos0::new(r).unwrap(),
kind: MatchKind::Match,
};
assert_eq!(pairs.len(), 3);
assert_eq!(pairs[0], m(2, 100));
assert_eq!(pairs[2], m(4, 102));
}
#[test]
fn aligned_pairs_empty_cigar() {
let rec = OwnedBamRecord::builder(-1, None, b"r".to_vec())
.flags(BamFlags::from(0x4))
.build()
.unwrap();
let pairs: Vec<_> = rec.aligned_pairs().unwrap().collect();
assert!(pairs.is_empty());
}
#[test]
fn aligned_pairs_rejects_unmapped_with_cigar() {
let rec = OwnedBamRecord::builder(-1, None, b"r".to_vec())
.flags(BamFlags::from(0x4))
.cigar(vec![CigarOp::new(CigarOpType::Match, 5)])
.seq(vec![Base::A; 5])
.qual(vec![BaseQuality::from_byte(30); 5])
.build()
.unwrap();
let result = rec.aligned_pairs();
assert!(matches!(
result,
Err(super::super::aligned_pairs::AlignedPairsError::UnmappedWithCigar { cigar_ops: 1 })
));
}
#[test]
fn aligned_pairs_with_read_attaches_query_base() {
use super::super::aligned_pairs_view::AlignedPairWithRead;
let rec = OwnedBamRecord::builder(0, Some(Pos0::new(50).unwrap()), b"r".to_vec())
.flags(BamFlags::empty())
.cigar(vec![CigarOp::new(CigarOpType::Match, 3)])
.seq(vec![Base::A, Base::C, Base::G])
.qual([30, 31, 32].map(BaseQuality::from_byte).to_vec())
.build()
.unwrap();
let events: Vec<_> = rec.aligned_pairs_with_read().unwrap().collect();
assert_eq!(events.len(), 3);
match events[0] {
AlignedPairWithRead::Match { qpos: 0, query: Base::A, .. } => {}
other => panic!("expected Match{{qpos=0, query=A}}, got {other:?}"),
}
match events[2] {
AlignedPairWithRead::Match { qpos: 2, query: Base::G, .. } => {}
other => panic!("expected Match{{qpos=2, query=G}}, got {other:?}"),
}
}
#[test]
fn aligned_pairs_with_read_propagates_unmapped_error() {
let rec = OwnedBamRecord::builder(-1, None, b"r".to_vec())
.flags(BamFlags::from(0x4))
.cigar(vec![CigarOp::new(CigarOpType::Match, 3)])
.seq(vec![Base::A; 3])
.qual(vec![BaseQuality::from_byte(30); 3])
.build()
.unwrap();
let result = rec.aligned_pairs_with_read();
assert!(matches!(
result,
Err(super::super::aligned_pairs::AlignedPairsError::UnmappedWithCigar { cigar_ops: 1 })
));
}
#[test]
fn aligned_pairs_accepts_unmapped_with_empty_cigar() {
let rec = OwnedBamRecord::builder(-1, None, b"r".to_vec())
.flags(BamFlags::from(0x4))
.build()
.unwrap();
let pairs: Vec<_> = rec.aligned_pairs().unwrap().collect();
assert!(pairs.is_empty());
}
}