1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
//! Data structures and functions for working with CIGAR strings.
use std::fmt;
/// A match/mismatch, insertion, or deletion operation.
///
/// When aligning `q` against `r`, this represents the edit operations to get from `r` to `q`.
#[derive(Debug, PartialEq, Copy, Clone)]
#[repr(u8)]
pub enum Operation {
/// Placeholder variant.
Sentinel = 0u8,
/// Match or mismatch.
///
/// This is a diagonal transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
M = 1u8,
/// Match.
Eq = 2u8,
/// Mismatch.
X = 3u8,
/// Insertion.
///
/// When aligning sequences `q` against `r`, this is a gap in `r`.
/// This is a row transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
I = 4u8,
/// Deletion.
///
/// When aligning sequences `q` against `r`, this is a gap in `q`.
/// This is a column transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
D = 5u8
}
/// An operation and how many times that operation is repeated.
#[derive(Debug, Copy, Clone)]
#[repr(C)]
pub struct OpLen {
pub op: Operation,
pub len: usize
}
/// A CIGAR string that holds a list of operations.
pub struct Cigar {
s: Vec<OpLen>,
idx: usize
}
impl Cigar {
/// Create a new CIGAR string with certain maximum lengths for the aligned sequences.
pub fn new(query_len: usize, reference_len: usize) -> Self {
let s = vec![OpLen { op: Operation::Sentinel, len: 0 }; query_len + reference_len + 5];
// first element should always be a sentinel
let idx = 1;
Cigar { s, idx }
}
/// Clear this CIGAR string.
#[allow(dead_code)]
pub(crate) fn clear(&mut self, query_len: usize, reference_len: usize) {
self.s[..query_len + reference_len + 5].fill(OpLen { op: Operation::Sentinel, len: 0 });
self.idx = 1;
}
/// Branchlessly add a new operation (in reverse order).
///
/// Other methods should allow the CIGAR string to be viewed
/// in the correct (not reversed) order.
///
/// The total number of added operations must not exceed the
/// maximum length the CIGAR string was created with.
#[allow(dead_code)]
pub(crate) unsafe fn add(&mut self, op: Operation) {
debug_assert!(self.idx < self.s.len());
// branchlessly append one operation
// make sure that contiguous operations are run-length encoded
let add = (op != (*self.s.as_ptr().add(self.idx - 1)).op) as usize;
self.idx += add;
(*self.s.as_mut_ptr().add(self.idx - 1)).op = op;
(*self.s.as_mut_ptr().add(self.idx - 1)).len += 1;
}
/// Length of the CIGAR string, not including the first sentinel.
pub fn len(&self) -> usize {
self.idx - 1
}
/// Get a certain operation in the CIGAR string.
pub fn get(&self, i: usize) -> OpLen {
self.s[self.idx - 1 - i]
}
/// Generate two strings to visualize the edit operations.
pub fn format(&self, q: &[u8], r: &[u8]) -> (String, String) {
let mut a = String::with_capacity(self.idx);
let mut b = String::with_capacity(self.idx);
let mut i = 0;
let mut j = 0;
for &op_len in self.s[1..self.idx].iter().rev() {
match op_len.op {
Operation::M | Operation::Eq | Operation::X => {
for _k in 0..op_len.len {
a.push(q[i] as char);
b.push(r[j] as char);
i += 1;
j += 1;
}
},
Operation::I => {
for _k in 0..op_len.len {
a.push(q[i] as char);
b.push('-');
i += 1;
}
},
Operation::D => {
for _k in 0..op_len.len {
a.push('-');
b.push(r[j] as char);
j += 1;
}
},
_ => continue
}
}
(a, b)
}
/// Create a copy of the operations in the CIGAR string and
/// ensure that the vector is provided in the correct order.
///
/// Sentinels are removed.
pub fn to_vec(&self) -> Vec<OpLen> {
self.s[1..self.idx]
.iter()
.rev()
.map(|&op_len| op_len)
.collect::<Vec<OpLen>>()
}
}
impl fmt::Display for Cigar {
/// Print a CIGAR string in standard CIGAR format.
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
for &op_len in self.s[1..self.idx].iter().rev() {
let c = match op_len.op {
Operation::M => 'M',
Operation::Eq => '=',
Operation::X => 'X',
Operation::I => 'I',
Operation::D => 'D',
_ => continue
};
write!(f, "{}{}", op_len.len, c)?;
}
Ok(())
}
}