use super::aligned_pairs::MatchKind;
use super::aligned_pairs_view::{AlignedPairWithRef, AlignedPairsWithRef};
use seqair_types::Base;
use thiserror::Error;
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum NmMdError {
#[error(
"reference base missing for MD computation at rpos={rpos}; either the loaded RefSeq \
doesn't cover this position or the CIGAR walks past the end"
)]
MissingReference { rpos: u32 },
}
impl<'cigar, 'read, 'ref_seq> AlignedPairsWithRef<'cigar, 'read, 'ref_seq> {
pub fn nm(self) -> u32 {
let mut nm: u32 = 0;
for ev in self {
match ev {
AlignedPairWithRef::Match { kind, query, ref_base, .. } => {
nm = nm.saturating_add(match_contribution(kind, query, ref_base));
}
AlignedPairWithRef::Insertion { query, .. } => {
let len = u32::try_from(query.len()).unwrap_or(u32::MAX);
nm = nm.saturating_add(len);
}
AlignedPairWithRef::Deletion { del_len, .. } => {
nm = nm.saturating_add(del_len);
}
AlignedPairWithRef::RefSkip { .. }
| AlignedPairWithRef::SoftClip { .. }
| AlignedPairWithRef::Padding { .. }
| AlignedPairWithRef::Unknown { .. } => {
}
}
}
nm
}
pub fn md(self) -> Result<Vec<u8>, NmMdError> {
let mut out: Vec<u8> = Vec::new();
let mut run: u32 = 0; let mut last_was_deletion = false;
for ev in self {
match ev {
AlignedPairWithRef::Match { kind, query, ref_base, rpos, .. } => {
let rb = ref_base.ok_or(NmMdError::MissingReference { rpos: *rpos })?;
let is_match = match kind {
MatchKind::SeqMatch => true,
MatchKind::SeqMismatch => false,
MatchKind::Match => bases_equal(query, rb),
};
if is_match {
run = run.saturating_add(1);
last_was_deletion = false;
} else {
push_number(&mut out, run);
out.push(base_to_md_byte(rb));
run = 0;
last_was_deletion = false;
}
}
AlignedPairWithRef::Deletion { del_len, ref_bases, rpos } => {
let bases = ref_bases.ok_or(NmMdError::MissingReference { rpos: *rpos })?;
debug_assert_eq!(bases.len(), del_len as usize);
if last_was_deletion {
out.push(b'0');
}
push_number(&mut out, run);
out.push(b'^');
for &b in bases {
out.push(base_to_md_byte(b));
}
run = 0;
last_was_deletion = true;
}
AlignedPairWithRef::Insertion { .. }
| AlignedPairWithRef::RefSkip { .. }
| AlignedPairWithRef::SoftClip { .. }
| AlignedPairWithRef::Padding { .. }
| AlignedPairWithRef::Unknown { .. } => {
last_was_deletion = false;
}
}
}
push_number(&mut out, run);
Ok(out)
}
}
fn match_contribution(kind: MatchKind, query: Base, ref_base: Option<Base>) -> u32 {
match kind {
MatchKind::SeqMatch => 0,
MatchKind::SeqMismatch => 1,
MatchKind::Match => {
match ref_base {
Some(rb) if !bases_equal(query, rb) => 1,
_ => 0,
}
}
}
}
fn bases_equal(a: Base, b: Base) -> bool {
a == b && a != Base::Unknown
}
fn base_to_md_byte(b: Base) -> u8 {
b as u8
}
fn push_number(out: &mut Vec<u8>, n: u32) {
if n == 0 {
out.push(b'0');
return;
}
let mut buf = [0u8; 10];
let mut i = buf.len();
let mut n = n;
while n > 0 {
i = i.saturating_sub(1);
let digit = (n % 10) as u8;
#[allow(clippy::indexing_slicing, reason = "i bounded by buf.len() above")]
{
buf[i] = b'0'.saturating_add(digit);
}
n /= 10;
}
#[allow(clippy::indexing_slicing, reason = "i is a valid start index into buf")]
out.extend_from_slice(&buf[i..]);
}
#[cfg(test)]
#[allow(clippy::arithmetic_side_effects, reason = "test arithmetic on small values")]
mod tests {
use super::super::cigar::{CigarOp, CigarOpType};
use super::super::owned_record::OwnedBamRecord;
use super::super::pileup::RefSeq;
use super::super::record_store::RecordStore;
use super::*;
use seqair_types::{BamFlags, Base, BaseQuality, Pos0};
use std::rc::Rc;
fn op(t: CigarOpType, len: u32) -> CigarOp {
CigarOp::new(t, len)
}
fn p0(v: u32) -> Pos0 {
Pos0::new(v).unwrap()
}
fn store_with_record(pos: Pos0, cigar: Vec<CigarOp>, seq: Vec<Base>) -> RecordStore<()> {
let qual = vec![BaseQuality::from_byte(30); seq.len()];
let owned = OwnedBamRecord::builder(0, Some(pos), b"r".to_vec())
.flags(BamFlags::empty())
.cigar(cigar)
.seq(seq)
.qual(qual)
.build()
.unwrap();
let mut buf = Vec::new();
owned.to_bam_bytes(&mut buf).unwrap();
let mut store = RecordStore::<()>::new();
let _ = store.push_raw(&buf, &mut ()).unwrap();
store
}
fn ref_window(start: u32, ascii: &[u8]) -> RefSeq {
let bases: Vec<Base> = ascii.iter().map(|&b| Base::from(b)).collect();
RefSeq::new(Rc::from(bases), p0(start))
}
#[test]
fn nm_match_only_no_mismatches_is_zero() {
let cigar = vec![op(CigarOpType::Match, 5)];
let seq: Vec<Base> = b"ACGTA".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"ACGTA");
let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
assert_eq!(nm, 0);
}
#[test]
fn nm_counts_mismatches_in_match_op() {
let cigar = vec![op(CigarOpType::Match, 5)];
let seq: Vec<Base> = b"ACGTA".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"ACGAA"); let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
assert_eq!(nm, 1);
}
#[test]
fn nm_counts_insertions_and_deletions() {
let cigar = vec![
op(CigarOpType::Match, 2),
op(CigarOpType::Insertion, 3),
op(CigarOpType::Match, 2),
op(CigarOpType::Deletion, 1),
op(CigarOpType::Match, 1),
];
let seq: Vec<Base> = b"AAGGGCCAA".iter().map(|&b| Base::from(b)).collect(); let _ = seq.len(); let seq: Vec<Base> = b"AAGGGCCAA"[..8].iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"AACCXA"); let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
assert_eq!(nm, 4);
}
#[test]
fn nm_seqmatch_and_seqmismatch_dont_consult_ref() {
let cigar = vec![
op(CigarOpType::SeqMatch, 2),
op(CigarOpType::SeqMismatch, 3),
op(CigarOpType::SeqMatch, 2),
];
let seq: Vec<Base> = b"AAACCCC".iter().map(|&b| Base::from(b)).collect(); let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"AAACCCC");
let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
assert_eq!(nm, 3, "X always contributes regardless of ref agreement");
}
#[test]
fn nm_skips_match_positions_outside_loaded_ref() {
let cigar = vec![op(CigarOpType::Match, 5)];
let seq: Vec<Base> = b"ACGTA".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(200, b"AAAAA");
let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
assert_eq!(nm, 0);
}
#[test]
fn nm_treats_n_as_mismatch() {
let cigar = vec![op(CigarOpType::Match, 1)];
let seq = vec![Base::Unknown];
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"A");
let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
assert_eq!(nm, 1);
}
#[test]
fn md_perfect_match_is_just_length_number() {
let cigar = vec![op(CigarOpType::Match, 5)];
let seq: Vec<Base> = b"ACGTA".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"ACGTA");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(md, b"5");
}
#[test]
fn md_single_mismatch_emits_ref_base() {
let cigar = vec![op(CigarOpType::Match, 5)];
let seq: Vec<Base> = b"ACGTA".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"ACTTA");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(md, b"2T2");
}
#[test]
fn md_consecutive_mismatches() {
let cigar = vec![op(CigarOpType::Match, 5)];
let seq: Vec<Base> = b"ACGTA".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"TCGGT");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(std::str::from_utf8(&md).unwrap(), "0T2G0T0");
}
#[test]
fn md_deletion_emits_caret_and_ref_bases() {
let cigar = vec![
op(CigarOpType::Match, 2),
op(CigarOpType::Deletion, 3),
op(CigarOpType::Match, 2),
];
let seq: Vec<Base> = b"AATT".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"AAACGTT");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(std::str::from_utf8(&md).unwrap(), "2^ACG2");
}
#[test]
fn md_skips_insertions_and_soft_clips() {
let cigar = vec![
op(CigarOpType::SoftClip, 2),
op(CigarOpType::Match, 2),
op(CigarOpType::Insertion, 3),
op(CigarOpType::Match, 2),
];
let seq: Vec<Base> = b"NNAAGGGCC".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"AACC");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(md, b"4");
}
#[test]
fn md_starts_with_zero_when_first_position_mismatches() {
let cigar = vec![op(CigarOpType::Match, 3)];
let seq: Vec<Base> = b"TGC".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"AGC");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(md, b"0A2");
}
#[test]
fn md_seqmatch_and_seqmismatch_use_ref_base_for_mismatch_letter() {
let cigar = vec![
op(CigarOpType::SeqMatch, 2),
op(CigarOpType::SeqMismatch, 1),
op(CigarOpType::SeqMatch, 2),
];
let seq: Vec<Base> = b"ACATT".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"ACGTT");
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(md, b"2G2");
}
#[test]
fn md_errors_on_missing_reference() {
let cigar = vec![op(CigarOpType::Match, 3)];
let seq: Vec<Base> = b"ACG".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(200, b"AAA");
let result =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).md();
assert!(matches!(result, Err(NmMdError::MissingReference { rpos: 100 })));
}
#[test]
fn nm_and_md_realistic_record() {
let cigar = vec![
op(CigarOpType::SoftClip, 2),
op(CigarOpType::Match, 3),
op(CigarOpType::Insertion, 1),
op(CigarOpType::Match, 2),
op(CigarOpType::Deletion, 1),
op(CigarOpType::Match, 2),
];
let seq: Vec<Base> = b"NNACGXGGTT".iter().map(|&b| Base::from(b)).collect();
let store = store_with_record(p0(100), cigar, seq);
let ref_seq = ref_window(100, b"ACTGGTTT");
let nm =
store.record(0).aligned_pairs_with_read(&store).unwrap().with_reference(&ref_seq).nm();
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
assert_eq!(nm, 3);
assert_eq!(md, b"2T2^T2");
}
fn parse_md_for_counts(md: &[u8]) -> (u32, u32) {
let mut mismatches = 0u32;
let mut deletions = 0u32;
let mut i = 0;
while i < md.len() {
#[allow(clippy::indexing_slicing, reason = "i bounded by md.len()")]
let b = md[i];
match b {
b'0'..=b'9' => {
i = i.saturating_add(1);
}
b'^' => {
i = i.saturating_add(1);
while i < md.len() {
#[allow(clippy::indexing_slicing, reason = "i bounded by md.len()")]
let c = md[i];
if c.is_ascii_alphabetic() {
deletions = deletions.saturating_add(1);
i = i.saturating_add(1);
} else {
break;
}
}
}
b'A'..=b'Z' | b'a'..=b'z' => {
mismatches = mismatches.saturating_add(1);
i = i.saturating_add(1);
}
_ => {
i = i.saturating_add(1);
}
}
}
(mismatches, deletions)
}
#[test]
fn parse_md_for_counts_matches_known_strings() {
assert_eq!(parse_md_for_counts(b"5"), (0, 0));
assert_eq!(parse_md_for_counts(b"2T2"), (1, 0));
assert_eq!(parse_md_for_counts(b"2^ACG2"), (0, 3));
assert_eq!(parse_md_for_counts(b"0T2G0T0"), (3, 0));
assert_eq!(parse_md_for_counts(b"2T2^T2"), (1, 1));
}
mod proptests {
use super::super::super::cigar::CigarOp;
use super::*;
use proptest::prelude::*;
fn arb_cigar_op() -> impl Strategy<Value = CigarOp> {
let ops = prop_oneof![
Just(CigarOpType::Match),
Just(CigarOpType::Insertion),
Just(CigarOpType::Deletion),
Just(CigarOpType::SoftClip),
Just(CigarOpType::SeqMatch),
Just(CigarOpType::SeqMismatch),
];
(ops, 1u32..=8u32).prop_map(|(t, len)| CigarOp::new(t, len))
}
proptest! {
#[test]
fn nm_equals_md_mismatches_plus_md_deletions_plus_cigar_insertions(
ops in proptest::collection::vec(arb_cigar_op(), 1..=8),
read_pos in 0u32..=1_000u32,
) {
let qlen: u32 = ops
.iter()
.filter(|o| o.op_type().consumes_query())
.map(|o| o.len())
.sum();
let rlen: u32 = ops
.iter()
.filter(|o| o.op_type().consumes_ref())
.map(|o| o.len())
.sum();
if qlen == 0 {
return Ok(()); }
let bases = [Base::A, Base::C, Base::G, Base::T];
let seq: Vec<Base> =
(0..qlen).map(|i| bases[(i % 4) as usize]).collect();
let qual: Vec<BaseQuality> =
(0..qlen).map(|_| BaseQuality::from_byte(30)).collect();
let owned = OwnedBamRecord::builder(0, Some(p0(read_pos)), b"r".to_vec())
.flags(BamFlags::empty())
.cigar(ops.clone())
.seq(seq)
.qual(qual)
.build()
.unwrap();
let mut buf = Vec::new();
owned.to_bam_bytes(&mut buf).unwrap();
let mut store = RecordStore::<()>::new();
let _ = store.push_raw(&buf, &mut ()).unwrap();
let ref_pat = [Base::C, Base::G, Base::T, Base::A];
let ref_buf: Vec<Base> =
(0..rlen).map(|i| ref_pat[(i % 4) as usize]).collect();
let ref_seq = RefSeq::new(Rc::from(ref_buf), p0(read_pos));
let nm = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.nm();
let md = store
.record(0)
.aligned_pairs_with_read(&store)
.unwrap()
.with_reference(&ref_seq)
.md()
.unwrap();
let (md_mismatches, md_deletions) = parse_md_for_counts(&md);
let cigar_insertions: u32 = ops
.iter()
.filter(|o| matches!(o.op_type(), CigarOpType::Insertion))
.map(|o| o.len())
.sum();
let derived_nm = md_mismatches
.saturating_add(md_deletions)
.saturating_add(cigar_insertions);
prop_assert_eq!(
nm, derived_nm,
"NM ({}) != mismatches({}) + deletions({}) + insertions({}); MD={:?}",
nm, md_mismatches, md_deletions, cigar_insertions, std::str::from_utf8(&md)
);
prop_assert!(!md.is_empty(), "MD must never be empty");
prop_assert!(
md[0].is_ascii_digit(),
"MD must start with a digit: {:?}",
std::str::from_utf8(&md),
);
prop_assert!(
md[md.len() - 1].is_ascii_digit(),
"MD must end with a digit: {:?}",
std::str::from_utf8(&md),
);
}
}
}
#[test]
fn push_number_handles_zero_and_multidigit() {
let mut buf = Vec::new();
push_number(&mut buf, 0);
assert_eq!(buf, b"0");
let mut buf = Vec::new();
push_number(&mut buf, 1);
assert_eq!(buf, b"1");
let mut buf = Vec::new();
push_number(&mut buf, 9);
assert_eq!(buf, b"9");
let mut buf = Vec::new();
push_number(&mut buf, 10);
assert_eq!(buf, b"10");
let mut buf = Vec::new();
push_number(&mut buf, 12345);
assert_eq!(buf, b"12345");
let mut buf = Vec::new();
push_number(&mut buf, u32::MAX);
assert_eq!(buf, b"4294967295");
}
}