use seqair_types::Base;
use thiserror::Error;
use super::aux::{AuxValue, GetAuxError, find_tag};
use super::cigar::{CigarMapping, CigarPosInfo};
use super::record_store::{RecordAccessError, RecordStore, SlimRecord};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModType {
Code(u8),
ChEBI(u32),
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModStrand {
Plus,
Minus,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModMode {
Implicit,
Unmodified,
Ambiguous,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Modification {
pub mod_type: ModType,
pub probability: u8,
pub canonical_base: Base,
pub strand: ModStrand,
}
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum BaseModError {
#[error("invalid canonical base in MM tag: {0:#x}")]
InvalidCanonicalBase(u8),
#[error("canonical base 'N' is not supported for modification resolution")]
UnsupportedUnknownBase,
#[error("invalid strand byte in MM tag: {0:#x}")]
InvalidStrand(u8),
#[error("missing modification code in MM entry")]
MissingModCode,
#[error("invalid modification code byte: {0:#x}")]
InvalidModCode(u8),
#[error("invalid ChEBI numeric id in MM tag")]
InvalidChebi,
#[error("invalid position delta in MM tag")]
InvalidDelta,
#[error("MM entry missing terminating ';'")]
MissingTerminator,
#[error("ML array length {ml} does not match total MM call count {mm}")]
MlLengthMismatch { ml: usize, mm: usize },
#[error(
"MM delta {delta} exceeds remaining count of canonical base {base:?} in sequence \
(is_reverse={is_reverse})"
)]
DeltaOutOfRange { delta: u32, base: Base, is_reverse: bool },
#[error("sequence length {len} exceeds u32::MAX; stored qpos cannot be represented")]
SeqTooLong { len: usize },
}
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum FromRecordError {
#[error(transparent)]
RecordAccess(#[from] RecordAccessError),
#[error("MM tag has wrong BAM type: {0}")]
BadMmTag(GetAuxError),
#[error("ML tag has wrong BAM type: expected B:C, got {got}")]
BadMlTag { got: &'static str },
#[error(transparent)]
Parse(#[from] BaseModError),
}
#[derive(Debug, Clone, Default)]
pub struct BaseModState {
qpos: Vec<u32>,
mods: Vec<Modification>,
any_unmodified_mode: [bool; 4],
}
impl BaseModState {
pub fn parse(
mm: &[u8],
ml: &[u8],
seq: &[Base],
is_reverse: bool,
) -> Result<Self, BaseModError> {
let mut state = BaseModState::default();
let mut pending: Vec<(u32, Modification)> = Vec::new();
let mut ml_cursor: usize = 0;
let mm = mm.strip_suffix(&[0]).unwrap_or(mm);
let mut rest = mm;
while !rest.is_empty() {
let (entry, next) = split_entry(rest)?;
rest = next;
if entry.is_empty() {
continue;
}
parse_entry(entry, seq, is_reverse, ml, &mut ml_cursor, &mut pending, &mut state)?;
}
if ml_cursor != ml.len() {
return Err(BaseModError::MlLengthMismatch { ml: ml.len(), mm: ml_cursor });
}
pending.sort_by_key(|&(qp, _)| qp);
state.qpos.reserve(pending.len());
state.mods.reserve(pending.len());
for (qp, m) in pending {
state.qpos.push(qp);
state.mods.push(m);
}
Ok(state)
}
pub fn from_record<U>(
rec: &SlimRecord,
store: &RecordStore<U>,
) -> Result<Option<Self>, FromRecordError> {
let aux = rec.aux(store)?;
let mm: &[u8] = match aux.get("MM") {
Ok(bytes) => bytes,
Err(GetAuxError::TagNotFound { .. }) => return Ok(None),
Err(other) => return Err(FromRecordError::BadMmTag(other)),
};
let ml: &[u8] = match find_tag(aux.as_bytes(), *b"ML") {
Some(AuxValue::ArrayU8(bytes)) => bytes,
Some(other) => return Err(FromRecordError::BadMlTag { got: other.type_name() }),
None => &[],
};
let seq = rec.seq(store)?;
let state = Self::parse(mm, ml, seq, rec.flags.is_reverse())?;
Ok(Some(state))
}
pub fn len(&self) -> usize {
self.mods.len()
}
pub fn is_empty(&self) -> bool {
self.mods.is_empty()
}
pub fn mod_at_qpos(&self, qpos: usize) -> Option<&[Modification]> {
let target = u32::try_from(qpos).ok()?;
let lo = self.qpos.partition_point(|&q| q < target);
let hi = self.qpos.partition_point(|&q| q <= target);
if lo == hi {
return None;
}
self.mods.get(lo..hi)
}
pub fn mod_at_ref_pos(
&self,
ref_pos: seqair_types::Pos0,
cigar: &CigarMapping,
) -> Option<&[Modification]> {
match cigar.pos_info_at(ref_pos)? {
CigarPosInfo::Match { qpos } | CigarPosInfo::Insertion { qpos, .. } => {
self.mod_at_qpos(qpos as usize)
}
CigarPosInfo::Deletion { .. }
| CigarPosInfo::RefSkip
| CigarPosInfo::ComplexIndel { .. } => None,
}
}
pub fn is_unmodified(&self, qpos: usize, canonical_base: Base) -> Option<bool> {
if self.mod_at_qpos(qpos).is_some() {
return Some(false);
}
let idx = canonical_base.known_index()?;
#[allow(clippy::indexing_slicing, reason = "known_index() returns Some(0..=3) for ACGT")]
let has_unmodified_mode = self.any_unmodified_mode[idx];
if has_unmodified_mode { Some(true) } else { None }
}
}
fn split_entry(mm: &[u8]) -> Result<(&[u8], &[u8]), BaseModError> {
let sc = mm.iter().position(|&b| b == b';').ok_or(BaseModError::MissingTerminator)?;
let (entry, rest) = mm.split_at(sc);
let rest = rest.get(1..).unwrap_or(&[]);
Ok((entry, rest))
}
#[allow(clippy::too_many_arguments, reason = "parser scratch state")]
fn parse_entry(
entry: &[u8],
seq: &[Base],
is_reverse: bool,
ml: &[u8],
ml_cursor: &mut usize,
pending: &mut Vec<(u32, Modification)>,
state: &mut BaseModState,
) -> Result<(), BaseModError> {
let canonical = parse_canonical_base(*entry.first().ok_or(BaseModError::MissingModCode)?)?;
let strand = parse_strand(*entry.get(1).ok_or(BaseModError::MissingModCode)?)?;
let body = entry.get(2..).unwrap_or(&[]);
let (mod_types, mode, deltas_bytes) = parse_body(body)?;
let deltas = parse_deltas(deltas_bytes)?;
let total = deltas
.len()
.checked_mul(mod_types.len())
.ok_or(BaseModError::MlLengthMismatch { ml: ml.len(), mm: usize::MAX })?;
let ml_end = ml_cursor
.checked_add(total)
.ok_or(BaseModError::MlLengthMismatch { ml: ml.len(), mm: usize::MAX })?;
if ml_end > ml.len() {
return Err(BaseModError::MlLengthMismatch { ml: ml.len(), mm: ml_end });
}
#[allow(clippy::indexing_slicing, reason = "bounds checked immediately above")]
let ml_slice = &ml[*ml_cursor..ml_end];
*ml_cursor = ml_end;
resolve_and_emit(
canonical, strand, &mod_types, mode, &deltas, ml_slice, seq, is_reverse, pending, state,
)?;
Ok(())
}
fn parse_canonical_base(byte: u8) -> Result<Base, BaseModError> {
match byte | 0x20 {
b'a' => Ok(Base::A),
b'c' => Ok(Base::C),
b'g' => Ok(Base::G),
b't' => Ok(Base::T),
b'n' => Err(BaseModError::UnsupportedUnknownBase),
_ => Err(BaseModError::InvalidCanonicalBase(byte)),
}
}
fn parse_strand(byte: u8) -> Result<ModStrand, BaseModError> {
match byte {
b'+' => Ok(ModStrand::Plus),
b'-' => Ok(ModStrand::Minus),
other => Err(BaseModError::InvalidStrand(other)),
}
}
fn parse_body(body: &[u8]) -> Result<(Vec<ModType>, ModMode, &[u8]), BaseModError> {
let first = *body.first().ok_or(BaseModError::MissingModCode)?;
let (mod_types, after_codes) = if first.is_ascii_digit() {
let end = body.iter().position(|b| !b.is_ascii_digit()).unwrap_or(body.len());
#[allow(clippy::indexing_slicing, reason = "end <= body.len() by construction")]
let digits = &body[..end];
let s = std::str::from_utf8(digits).map_err(|_| BaseModError::InvalidChebi)?;
let id: u32 = s.parse().map_err(|_| BaseModError::InvalidChebi)?;
(vec![ModType::ChEBI(id)], body.get(end..).unwrap_or(&[]))
} else {
let mut codes = Vec::new();
let mut i = 0;
while let Some(&b) = body.get(i) {
if matches!(b, b',' | b';' | b'.' | b'?') {
break;
}
if !b.is_ascii_alphabetic() {
return Err(BaseModError::InvalidModCode(b));
}
codes.push(ModType::Code(b));
i = i.saturating_add(1);
}
if codes.is_empty() {
return Err(BaseModError::MissingModCode);
}
(codes, body.get(i..).unwrap_or(&[]))
};
let (mode, remaining) = match after_codes.first() {
Some(b'.') => (ModMode::Unmodified, after_codes.get(1..).unwrap_or(&[])),
Some(b'?') => (ModMode::Ambiguous, after_codes.get(1..).unwrap_or(&[])),
_ => (ModMode::Implicit, after_codes),
};
Ok((mod_types, mode, remaining))
}
fn parse_deltas(bytes: &[u8]) -> Result<Vec<u32>, BaseModError> {
let mut out = Vec::new();
let mut rest = bytes;
while let Some((&first, tail)) = rest.split_first() {
if first != b',' {
return Err(BaseModError::InvalidDelta);
}
let end = tail.iter().position(|b| !b.is_ascii_digit()).unwrap_or(tail.len());
if end == 0 {
return Err(BaseModError::InvalidDelta);
}
#[allow(clippy::indexing_slicing, reason = "end > 0 and end <= tail.len()")]
let digits = &tail[..end];
let s = std::str::from_utf8(digits).map_err(|_| BaseModError::InvalidDelta)?;
let n: u32 = s.parse().map_err(|_| BaseModError::InvalidDelta)?;
out.push(n);
rest = tail.get(end..).unwrap_or(&[]);
}
Ok(out)
}
#[allow(clippy::too_many_arguments, reason = "per-entry resolution is a local helper")]
fn resolve_and_emit(
canonical: Base,
strand: ModStrand,
mod_types: &[ModType],
mode: ModMode,
deltas: &[u32],
ml_slice: &[u8],
seq: &[Base],
is_reverse: bool,
pending: &mut Vec<(u32, Modification)>,
state: &mut BaseModState,
) -> Result<(), BaseModError> {
if let Some(idx) = canonical.known_index()
&& mode == ModMode::Unmodified
{
#[allow(clippy::indexing_slicing, reason = "known_index() returns Some(0..=3)")]
{
state.any_unmodified_mode[idx] = true;
}
}
if deltas.is_empty() {
return Ok(());
}
let target = if is_reverse { canonical.inverse() } else { canonical };
let seq_len = seq.len();
let mut delta_idx: usize = 0;
let mut remaining: u32 = *deltas.first().ok_or(BaseModError::InvalidDelta)?;
let num_mods = mod_types.len();
let mut stored_idx: usize = if is_reverse { seq_len.saturating_sub(1) } else { 0 };
let mut steps: usize = 0;
while steps < seq_len && delta_idx < deltas.len() {
#[allow(
clippy::indexing_slicing,
reason = "stored_idx < seq_len by the while guard / decrement logic"
)]
let b = seq[stored_idx];
if b == target {
if remaining == 0 {
let qp = try_qpos(stored_idx, seq_len)?;
let base_offset = delta_idx
.checked_mul(num_mods)
.ok_or(BaseModError::MlLengthMismatch { ml: ml_slice.len(), mm: usize::MAX })?;
for (k, mt) in mod_types.iter().enumerate() {
let ml_idx =
base_offset.checked_add(k).ok_or(BaseModError::MlLengthMismatch {
ml: ml_slice.len(),
mm: usize::MAX,
})?;
let prob = *ml_slice.get(ml_idx).ok_or(BaseModError::MlLengthMismatch {
ml: ml_slice.len(),
mm: usize::MAX,
})?;
pending.push((
qp,
Modification {
mod_type: *mt,
probability: prob,
canonical_base: canonical,
strand,
},
));
}
let _ = mode;
delta_idx = delta_idx.saturating_add(1);
if delta_idx >= deltas.len() {
break;
}
#[allow(clippy::indexing_slicing, reason = "delta_idx < deltas.len()")]
{
remaining = deltas[delta_idx];
}
} else {
remaining = remaining.saturating_sub(1);
}
}
steps = steps.saturating_add(1);
if is_reverse {
if stored_idx == 0 {
break;
}
stored_idx = stored_idx.saturating_sub(1);
} else {
stored_idx = stored_idx.saturating_add(1);
}
}
if delta_idx < deltas.len() {
#[allow(clippy::indexing_slicing, reason = "delta_idx < deltas.len() by the guard")]
return Err(BaseModError::DeltaOutOfRange {
delta: deltas[delta_idx],
base: canonical,
is_reverse,
});
}
Ok(())
}
fn try_qpos(stored_idx: usize, seq_len: usize) -> Result<u32, BaseModError> {
u32::try_from(stored_idx).map_err(|_| BaseModError::SeqTooLong { len: seq_len })
}
#[cfg(test)]
#[allow(
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::indexing_slicing,
clippy::arithmetic_side_effects,
clippy::cast_possible_truncation,
reason = "test code"
)]
mod tests {
use super::*;
use seqair_types::Base::{A, C, G, T};
fn seq(bases: &[Base]) -> Vec<Base> {
bases.to_vec()
}
#[test]
fn parses_simple_cpg_entry() {
let s = seq(&[A, C, G, T, A, C, G, C, G, A, C, G, T]);
let state = BaseModState::parse(b"C+m,0,2;", &[200, 180], &s, false).unwrap();
assert_eq!(state.len(), 2);
let m1 = state.mod_at_qpos(1).unwrap();
assert_eq!(m1.len(), 1);
assert_eq!(m1[0].probability, 200);
assert!(matches!(m1[0].mod_type, ModType::Code(b'm')));
assert_eq!(m1[0].canonical_base, C);
assert!(matches!(m1[0].strand, ModStrand::Plus));
let m2 = state.mod_at_qpos(10).unwrap();
assert_eq!(m2[0].probability, 180);
}
#[test]
fn delta_zero_is_first_occurrence() {
let s = seq(&[A, C, C, C]);
let state = BaseModState::parse(b"C+m,0;", &[128], &s, false).unwrap();
assert_eq!(state.mod_at_qpos(1).unwrap()[0].probability, 128);
assert!(state.mod_at_qpos(0).is_none());
assert!(state.mod_at_qpos(2).is_none());
}
#[test]
fn reverse_strand_uses_complement_counting_from_end() {
let stored = seq(&[C, G, T, A, C, G, T]);
let state = BaseModState::parse(b"C+m,0,0;", &[200, 210], &stored, true).unwrap();
let m_first = state.mod_at_qpos(5).unwrap();
assert_eq!(m_first[0].probability, 200);
let m_second = state.mod_at_qpos(1).unwrap();
assert_eq!(m_second[0].probability, 210);
}
#[test]
fn parses_multiple_entries() {
let s = seq(&[A, C, G, C, G, A, C, G]);
let state = BaseModState::parse(b"C+m,0;C+h,1;", &[200, 150], &s, false).unwrap();
assert!(matches!(state.mod_at_qpos(1).unwrap()[0].mod_type, ModType::Code(b'm')));
assert!(matches!(state.mod_at_qpos(3).unwrap()[0].mod_type, ModType::Code(b'h')));
}
#[test]
fn combined_codes_interleave_ml() {
let s = seq(&[A, C, A, C, A, C, A, C]);
let state = BaseModState::parse(b"C+mh,0,2;", &[200, 100, 220, 120], &s, false).unwrap();
let at1 = state.mod_at_qpos(1).unwrap();
assert_eq!(at1.len(), 2);
assert!(matches!(at1[0].mod_type, ModType::Code(b'm')));
assert_eq!(at1[0].probability, 200);
assert!(matches!(at1[1].mod_type, ModType::Code(b'h')));
assert_eq!(at1[1].probability, 100);
let at7 = state.mod_at_qpos(7).unwrap();
assert_eq!(at7.len(), 2);
assert_eq!(at7[0].probability, 220);
assert_eq!(at7[1].probability, 120);
}
#[test]
fn parses_chebi_code() {
let s = seq(&[A, C, G, T]);
let state = BaseModState::parse(b"C+27551,0;", &[200], &s, false).unwrap();
assert!(matches!(state.mod_at_qpos(1).unwrap()[0].mod_type, ModType::ChEBI(27551)));
}
#[test]
fn is_unmodified_mixed_mode_same_canonical_is_per_canonical() {
let s = seq(&[A, C, A, C, A, C, A, C]);
let state = BaseModState::parse(b"C+m.,0;C+h,1;", &[200, 150], &s, false).unwrap();
assert_eq!(state.is_unmodified(1, C), Some(false));
assert_eq!(state.is_unmodified(3, C), Some(false));
assert_eq!(
state.is_unmodified(5, C),
Some(true),
"is_unmodified is per-canonical-base, not per-mod-type"
);
}
#[test]
fn is_unmodified_three_modes() {
let s = seq(&[A, C, G, C, G, C]);
let implicit = BaseModState::parse(b"C+m,0;", &[200], &s, false).unwrap();
assert_eq!(implicit.is_unmodified(1, C), Some(false));
assert_eq!(implicit.is_unmodified(3, C), None);
let explicit = BaseModState::parse(b"C+m.,0;", &[200], &s, false).unwrap();
assert_eq!(explicit.is_unmodified(1, C), Some(false));
assert_eq!(explicit.is_unmodified(3, C), Some(true));
let ambiguous = BaseModState::parse(b"C+m?,0;", &[200], &s, false).unwrap();
assert_eq!(ambiguous.is_unmodified(3, C), None);
}
#[test]
fn query_refpos_via_cigar() {
use super::super::cigar::{CigarMapping, CigarOp, CigarOpType};
use seqair_types::Pos0;
let s = seq(&[A, C, G, C, G, A, C, G, T, A]);
let state = BaseModState::parse(b"C+m,0,0;", &[200, 180], &s, false).unwrap();
let cigar = [CigarOp::new(CigarOpType::Match, 10)];
let cm = CigarMapping::new(Pos0::new(1000).unwrap(), &cigar).unwrap();
let m = state.mod_at_ref_pos(Pos0::new(1001).unwrap(), &cm).unwrap();
assert_eq!(m[0].probability, 200);
let m2 = state.mod_at_ref_pos(Pos0::new(1003).unwrap(), &cm).unwrap();
assert_eq!(m2[0].probability, 180);
assert!(state.mod_at_ref_pos(Pos0::new(1002).unwrap(), &cm).is_none());
}
#[test]
fn validation_ml_length_mismatch() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"C+m,0;", &[200, 180], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::MlLengthMismatch { .. }), "got {err:?}");
}
#[test]
fn validation_missing_terminator() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"C+m,0", &[200], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::MissingTerminator), "got {err:?}");
}
#[test]
fn validation_delta_out_of_range() {
let s = seq(&[A, C]); let err = BaseModState::parse(b"C+m,5;", &[200], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::DeltaOutOfRange { .. }), "got {err:?}");
}
#[test]
fn validation_bad_canonical() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"Z+m,0;", &[200], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::InvalidCanonicalBase(_)), "got {err:?}");
}
#[test]
fn validation_bad_strand() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"C*m,0;", &[200], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::InvalidStrand(_)), "got {err:?}");
}
#[test]
fn empty_mm_empty_ml_ok() {
let s = seq(&[A, C, G]);
let state = BaseModState::parse(b"", &[], &s, false).unwrap();
assert!(state.is_empty());
assert!(state.mod_at_qpos(1).is_none());
assert_eq!(state.is_unmodified(1, C), None);
}
#[test]
fn trailing_nul_tolerated() {
let s = seq(&[A, C, G]);
let state = BaseModState::parse(b"C+m,0;\0", &[200], &s, false).unwrap();
assert_eq!(state.len(), 1);
}
#[test]
fn strand_minus_parses() {
let s = seq(&[A, C, G]);
let state = BaseModState::parse(b"C-m,0;", &[200], &s, false).unwrap();
assert!(matches!(state.mod_at_qpos(1).unwrap()[0].strand, ModStrand::Minus));
}
#[test]
fn lowercase_canonical_base() {
let s = seq(&[A, C, G]);
let state = BaseModState::parse(b"c+m,0;", &[200], &s, false).unwrap();
assert_eq!(state.mod_at_qpos(1).unwrap()[0].canonical_base, C);
}
#[test]
fn two_entries_same_canonical_sort_by_qpos() {
let s = seq(&[A, C, G, C, G, C]);
let state = BaseModState::parse(b"C+m,2;C+h,0;", &[200, 100], &s, false).unwrap();
assert!(matches!(state.mod_at_qpos(1).unwrap()[0].mod_type, ModType::Code(b'h')));
assert!(matches!(state.mod_at_qpos(5).unwrap()[0].mod_type, ModType::Code(b'm')));
}
#[test]
fn validation_invalid_delta_empty() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"C+m,;", &[], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::InvalidDelta), "got {err:?}");
}
#[test]
fn validation_missing_mod_code_empty_body() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"C+;", &[], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::MissingModCode), "got {err:?}");
}
#[test]
fn validation_missing_mod_code_only_delimiter() {
let s = seq(&[A, C, G]);
let err = BaseModState::parse(b"C+,0;", &[200], &s, false).unwrap_err();
assert!(matches!(err, BaseModError::MissingModCode), "got {err:?}");
}
#[test]
fn try_qpos_accepts_in_range_indices() {
assert_eq!(try_qpos(0, 1).unwrap(), 0);
assert_eq!(try_qpos(u32::MAX as usize, u32::MAX as usize + 1).unwrap(), u32::MAX);
}
use crate::bam::aux_data::AuxData;
use crate::bam::record_store::RecordStore;
use seqair_types::{BamFlags, Pos0};
fn push_record(bases: &[Base], aux: &[u8]) -> (RecordStore<()>, u32) {
let mut store: RecordStore<()> = RecordStore::new();
let pos = Pos0::new(100).unwrap();
let qual = vec![30u8; bases.len()];
let idx = store
.push_fields(
pos,
pos,
BamFlags::empty(),
30,
bases.len() as u32,
0,
b"r1",
&[],
bases,
&qual,
aux,
0,
-1,
-1,
0,
&mut (),
)
.unwrap()
.unwrap();
(store, idx)
}
#[test]
fn from_record_with_mm_and_ml_returns_some() {
let bases = vec![A, C, G, T, A, C, G, C, G, A, C, G, T];
let mut aux = AuxData::new();
aux.set_string(*b"MM", b"C+m,0,2;");
aux.set_array_u8(*b"ML", &[200, 180]).unwrap();
let (store, idx) = push_record(&bases, aux.as_bytes());
let rec = store.record(idx);
let from_rec =
BaseModState::from_record(rec, &store).unwrap().expect("MM tag present, expected Some");
let expected = BaseModState::parse(b"C+m,0,2;", &[200, 180], &bases, false).unwrap();
assert_eq!(from_rec.len(), expected.len());
let m_from = from_rec.mod_at_qpos(1).unwrap();
let m_exp = expected.mod_at_qpos(1).unwrap();
assert_eq!(m_from.len(), m_exp.len());
assert_eq!(m_from[0].probability, m_exp[0].probability);
assert_eq!(m_from[0].canonical_base, m_exp[0].canonical_base);
}
#[test]
fn from_record_with_mm_only_uses_empty_ml() {
let bases = vec![A, C, G];
let mut aux = AuxData::new();
aux.set_string(*b"MM", b"C+m,0;");
let (store, idx) = push_record(&bases, aux.as_bytes());
let rec = store.record(idx);
let err = BaseModState::from_record(rec, &store).unwrap_err();
assert!(
matches!(err, FromRecordError::Parse(BaseModError::MlLengthMismatch { .. })),
"got {err:?}"
);
let mut aux2 = AuxData::new();
aux2.set_string(*b"MM", b"C+m;");
let (store2, idx2) = push_record(&bases, aux2.as_bytes());
let rec2 = store2.record(idx2);
let state = BaseModState::from_record(rec2, &store2).unwrap().expect("MM present");
assert!(state.is_empty());
}
#[test]
fn from_record_without_mm_returns_none() {
let bases = vec![A, C, G];
let mut aux = AuxData::new();
aux.set_string(*b"RG", b"grp1");
let (store, idx) = push_record(&bases, aux.as_bytes());
let rec = store.record(idx);
let state = BaseModState::from_record(rec, &store).unwrap();
assert!(state.is_none(), "expected None when MM absent, got Some");
}
#[test]
fn from_record_with_malformed_mm_returns_err() {
let bases = vec![A, C, G];
let mut aux = AuxData::new();
aux.set_string(*b"MM", b"C+m,0");
aux.set_array_u8(*b"ML", &[200]).unwrap();
let (store, idx) = push_record(&bases, aux.as_bytes());
let rec = store.record(idx);
let err = BaseModState::from_record(rec, &store).unwrap_err();
assert!(
matches!(err, FromRecordError::Parse(BaseModError::MissingTerminator)),
"got {err:?}"
);
}
#[test]
fn from_record_with_wrong_mm_type_returns_err() {
let bases = vec![A, C, G];
let mut aux = AuxData::new();
aux.set_int(*b"MM", 42).unwrap();
let (store, idx) = push_record(&bases, aux.as_bytes());
let rec = store.record(idx);
let err = BaseModState::from_record(rec, &store).unwrap_err();
assert!(matches!(err, FromRecordError::BadMmTag(_)), "got {err:?}");
}
#[test]
fn from_record_with_wrong_ml_type_returns_err() {
let bases = vec![A, C, G];
let mut aux = AuxData::new();
aux.set_string(*b"MM", b"C+m,0;");
aux.set_array_u16(*b"ML", &[200]).unwrap();
let (store, idx) = push_record(&bases, aux.as_bytes());
let rec = store.record(idx);
let err = BaseModState::from_record(rec, &store).unwrap_err();
assert!(matches!(err, FromRecordError::BadMlTag { got: "B:S" }), "got {err:?}");
}
#[test]
fn from_record_reverse_strand_uses_complement() {
let bases = vec![C, G, T, A, C, G, T];
let mut aux = AuxData::new();
aux.set_string(*b"MM", b"C+m,0,0;");
aux.set_array_u8(*b"ML", &[200, 210]).unwrap();
let mut store: RecordStore<()> = RecordStore::new();
let pos = Pos0::new(100).unwrap();
let qual = vec![30u8; bases.len()];
let mut flags = BamFlags::empty();
flags.set(seqair_types::bam_flags::consts::FLAG_REVERSE);
let idx = store
.push_fields(
pos,
pos,
flags,
30,
bases.len() as u32,
0,
b"r1",
&[],
&bases,
&qual,
aux.as_bytes(),
0,
-1,
-1,
0,
&mut (),
)
.unwrap()
.unwrap();
let rec = store.record(idx);
let state = BaseModState::from_record(rec, &store).unwrap().expect("MM present");
assert_eq!(state.mod_at_qpos(5).unwrap()[0].probability, 200);
assert_eq!(state.mod_at_qpos(1).unwrap()[0].probability, 210);
}
use proptest::prelude::*;
proptest! {
#[test]
fn proptest_n_canonical_rejected(case in prop_oneof![Just(b'N'), Just(b'n')]) {
let s = seq(&[A, C, G]);
let mm = [case, b'+', b'm', b',', b'0', b';'];
let err = BaseModState::parse(&mm, &[200], &s, false).unwrap_err();
prop_assert!(matches!(err, BaseModError::UnsupportedUnknownBase), "got {:?}", err);
}
#[test]
fn proptest_invalid_canonical_base_byte(byte in any::<u8>().prop_filter(
"exclude valid ACGT/N (any case) and the entry terminator",
|b| !matches!(b | 0x20, b'a' | b'c' | b'g' | b't' | b'n') && *b != b';',
)) {
let s = seq(&[A, C, G]);
let mm = [byte, b'+', b'm', b',', b'0', b';'];
let err = BaseModState::parse(&mm, &[200], &s, false).unwrap_err();
prop_assert!(
matches!(err, BaseModError::InvalidCanonicalBase(b) if b == byte),
"expected InvalidCanonicalBase({byte:#x}), got {err:?}",
);
}
#[test]
fn proptest_invalid_mod_code_byte(byte in any::<u8>().prop_filter(
"exclude alpha, digits, and mod-body delimiters",
|b| !b.is_ascii_alphabetic()
&& !b.is_ascii_digit()
&& !matches!(*b, b',' | b';' | b'.' | b'?'),
)) {
let s = seq(&[A, C, G]);
let mm = [b'C', b'+', byte, b',', b'0', b';'];
let err = BaseModState::parse(&mm, &[200], &s, false).unwrap_err();
prop_assert!(
matches!(err, BaseModError::InvalidModCode(b) if b == byte),
"expected InvalidModCode({byte:#x}), got {err:?}",
);
}
#[test]
fn proptest_invalid_delta_non_numeric(byte in any::<u8>().prop_filter(
"non-digit, not the entry terminator",
|b| !b.is_ascii_digit() && *b != b';',
)) {
let s = seq(&[A, C, G]);
let mm = [b'C', b'+', b'm', b',', byte, b';'];
let err = BaseModState::parse(&mm, &[200], &s, false).unwrap_err();
prop_assert!(matches!(err, BaseModError::InvalidDelta), "got {err:?}");
}
#[cfg(target_pointer_width = "64")]
#[test]
fn proptest_try_qpos_rejects_overflow(
stored_idx in (u64::from(u32::MAX) + 1)..=u64::from(u32::MAX) + 1_000_000,
extra in 0u64..=1_000_000,
) {
let seq_len = (stored_idx + extra) as usize;
let stored_idx = stored_idx as usize;
match try_qpos(stored_idx, seq_len) {
Err(BaseModError::SeqTooLong { len }) => prop_assert_eq!(len, seq_len),
other => prop_assert!(false, "expected SeqTooLong, got {:?}", other),
}
}
#[test]
fn proptest_chebi_overflow_rejected(
id in (u64::from(u32::MAX) + 1)..=u64::MAX,
) {
let s = seq(&[A, C, G]);
let mut mm: Vec<u8> = b"C+".to_vec();
mm.extend_from_slice(id.to_string().as_bytes());
mm.extend_from_slice(b",0;");
let err = BaseModState::parse(&mm, &[200], &s, false).unwrap_err();
prop_assert!(matches!(err, BaseModError::InvalidChebi), "got {err:?}");
}
}
}