use std::sync::Arc;
use crate::core::locus::Position;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CigarOpKind {
Match,
Insertion,
Deletion,
RefSkip,
SoftClip,
HardClip,
Pad,
}
impl CigarOpKind {
pub fn consumes_ref(self) -> bool {
matches!(
self,
CigarOpKind::Match | CigarOpKind::Deletion | CigarOpKind::RefSkip
)
}
pub fn consumes_read(self) -> bool {
matches!(
self,
CigarOpKind::Match | CigarOpKind::Insertion | CigarOpKind::SoftClip
)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CigarOp {
pub kind: CigarOpKind,
pub len: u32,
}
impl CigarOp {
pub fn new(kind: CigarOpKind, len: u32) -> Self {
Self { kind, len }
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct SamFlags(pub u16);
impl SamFlags {
pub const PAIRED: u16 = 0x1;
pub const PROPER_PAIR: u16 = 0x2;
pub const UNMAPPED: u16 = 0x4;
pub const REVERSE: u16 = 0x10;
pub const SECONDARY: u16 = 0x100;
pub const DUPLICATE: u16 = 0x400;
pub const SUPPLEMENTARY: u16 = 0x800;
pub fn contains(self, bit: u16) -> bool {
self.0 & bit != 0
}
pub fn is_unmapped(self) -> bool {
self.contains(Self::UNMAPPED)
}
pub fn is_reverse(self) -> bool {
self.contains(Self::REVERSE)
}
pub fn is_secondary(self) -> bool {
self.contains(Self::SECONDARY)
}
pub fn is_supplementary(self) -> bool {
self.contains(Self::SUPPLEMENTARY)
}
pub fn is_duplicate(self) -> bool {
self.contains(Self::DUPLICATE)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RefBase {
pub ref_pos: u32,
pub read_offset: usize,
}
#[derive(Debug, Clone)]
pub struct AlignedRead {
pub contig: u32,
pub pos: Position,
pub mapq: u8,
pub flags: SamFlags,
pub cigar: Vec<CigarOp>,
pub seq: Arc<[u8]>,
pub qual: Arc<[u8]>,
}
impl AlignedRead {
pub fn ref_span(&self) -> u32 {
self.cigar
.iter()
.filter(|o| o.kind.consumes_ref())
.map(|o| o.len)
.sum()
}
pub fn end(&self) -> u32 {
self.pos.0 + self.ref_span()
}
pub fn projected_bases(&self) -> Vec<RefBase> {
let mut out = Vec::new();
let mut ref_pos = self.pos.0;
let mut read_off: usize = 0;
for op in &self.cigar {
match op.kind {
CigarOpKind::Match => {
for _ in 0..op.len {
out.push(RefBase {
ref_pos,
read_offset: read_off,
});
ref_pos += 1;
read_off += 1;
}
}
CigarOpKind::Insertion | CigarOpKind::SoftClip => {
read_off += op.len as usize;
}
CigarOpKind::Deletion | CigarOpKind::RefSkip => {
ref_pos += op.len;
}
CigarOpKind::HardClip | CigarOpKind::Pad => {}
}
}
out
}
}
#[cfg(test)]
mod tests {
use super::*;
fn op(kind: CigarOpKind, len: u32) -> CigarOp {
CigarOp::new(kind, len)
}
fn read(pos: u32, cigar: Vec<CigarOp>, seq: &[u8]) -> AlignedRead {
AlignedRead {
contig: 0,
pos: Position(pos),
mapq: 60,
flags: SamFlags::default(),
cigar,
seq: Arc::from(seq.to_vec().into_boxed_slice()),
qual: Arc::from(vec![30u8; seq.len()].into_boxed_slice()),
}
}
#[test]
fn ref_span_and_end_are_cigar_derived_not_seq_len() {
let r = read(
100,
vec![
op(CigarOpKind::Match, 3),
op(CigarOpKind::Deletion, 1),
op(CigarOpKind::Match, 2),
],
b"ACGTT",
);
assert_eq!(r.ref_span(), 6);
assert_eq!(r.end(), 106);
}
#[test]
fn projection_handles_softclip_insertion_deletion_refskip() {
let r = read(
50,
vec![
op(CigarOpKind::SoftClip, 2),
op(CigarOpKind::Match, 3),
op(CigarOpKind::Insertion, 1),
op(CigarOpKind::Match, 2),
op(CigarOpKind::Deletion, 1),
op(CigarOpKind::Match, 2),
],
b"NNACGTTGAA", );
let proj = r.projected_bases();
let pairs: Vec<(u32, usize)> = proj.iter().map(|p| (p.ref_pos, p.read_offset)).collect();
assert_eq!(
pairs,
vec![
(50, 2),
(51, 3),
(52, 4), (53, 6),
(54, 7), (56, 8),
(57, 9), ]
);
assert!(!pairs
.iter()
.any(|(_, off)| *off == 0 || *off == 1 || *off == 5));
}
#[test]
fn refskip_consumes_reference_only_like_a_long_intron() {
let r = read(
0,
vec![
op(CigarOpKind::Match, 2),
op(CigarOpKind::RefSkip, 100),
op(CigarOpKind::Match, 2),
],
b"ACGT",
);
let proj = r.projected_bases();
assert_eq!(proj.first().unwrap().ref_pos, 0);
assert_eq!(proj.last().unwrap().ref_pos, 103);
assert_eq!(proj.len(), 4);
assert_eq!(r.ref_span(), 104);
}
#[test]
fn long_read_projects_every_match_base() {
let seq = vec![b'A'; 5000];
let r = read(1000, vec![op(CigarOpKind::Match, 5000)], &seq);
let proj = r.projected_bases();
assert_eq!(proj.len(), 5000);
assert_eq!(
proj[0],
RefBase {
ref_pos: 1000,
read_offset: 0
}
);
assert_eq!(
proj[4999],
RefBase {
ref_pos: 5999,
read_offset: 4999
}
);
}
#[test]
fn sam_flags_decode_common_bits() {
let f = SamFlags(SamFlags::REVERSE | SamFlags::DUPLICATE);
assert!(f.is_reverse());
assert!(f.is_duplicate());
assert!(!f.is_secondary());
assert!(!f.is_unmapped());
}
#[test]
fn empty_cigar_projects_nothing() {
let r = read(100, vec![], b"");
assert!(r.projected_bases().is_empty());
assert_eq!(r.ref_span(), 0);
assert_eq!(r.end(), 100);
}
#[test]
fn pad_consumes_neither_ref_nor_read() {
let r = read(
0,
vec![
op(CigarOpKind::Match, 2),
op(CigarOpKind::Pad, 1),
op(CigarOpKind::Match, 2),
],
b"ACGT",
);
let pairs: Vec<(u32, usize)> = r
.projected_bases()
.iter()
.map(|p| (p.ref_pos, p.read_offset))
.collect();
assert_eq!(pairs, vec![(0, 0), (1, 1), (2, 2), (3, 3)]);
assert_eq!(r.ref_span(), 4);
assert!(!CigarOpKind::Pad.consumes_ref());
assert!(!CigarOpKind::Pad.consumes_read());
}
#[test]
fn projection_offsets_follow_cigar_not_seq_len() {
let r = read(10, vec![op(CigarOpKind::Match, 3)], b"AC");
let offsets: Vec<usize> = r.projected_bases().iter().map(|p| p.read_offset).collect();
assert_eq!(offsets, vec![0, 1, 2]);
}
}