block_aligner/
cigar.rs

1//! Data structures and functions for working with CIGAR strings.
2
3use std::fmt;
4
5/// A match/mismatch, insertion, or deletion operation.
6///
7/// When aligning `q` against `r`, this represents the edit operations to get from `r` to `q`.
8#[derive(Debug, PartialEq, Copy, Clone)]
9#[repr(u8)]
10pub enum Operation {
11    /// Placeholder variant.
12    Sentinel = 0u8,
13    /// Match or mismatch.
14    ///
15    /// This is a diagonal transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
16    M = 1u8,
17    /// Match.
18    Eq = 2u8,
19    /// Mismatch.
20    X = 3u8,
21    /// Insertion.
22    ///
23    /// When aligning sequences `q` against `r`, this is a gap in `r`.
24    /// This is a row transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
25    I = 4u8,
26    /// Deletion.
27    ///
28    /// When aligning sequences `q` against `r`, this is a gap in `q`.
29    /// This is a column transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
30    D = 5u8
31}
32
33/// An operation and how many times that operation is repeated.
34#[derive(Debug, Copy, Clone)]
35#[repr(C)]
36pub struct OpLen {
37    pub op: Operation,
38    pub len: usize
39}
40
41/// A CIGAR string that holds a list of operations.
42pub struct Cigar {
43    s: Vec<OpLen>,
44    idx: usize
45}
46
47impl Cigar {
48    /// Create a new CIGAR string with certain maximum lengths for the aligned sequences.
49    pub fn new(query_len: usize, reference_len: usize) -> Self {
50        let s = vec![OpLen { op: Operation::Sentinel, len: 0 }; query_len + reference_len + 5];
51        // first element should always be a sentinel
52        let idx = 1;
53        Cigar { s, idx }
54    }
55
56    /// Clear this CIGAR string.
57    #[allow(dead_code)]
58    pub(crate) fn clear(&mut self, query_len: usize, reference_len: usize) {
59        self.s[..query_len + reference_len + 5].fill(OpLen { op: Operation::Sentinel, len: 0 });
60        self.idx = 1;
61    }
62
63    /// Branchlessly add a new operation (in reverse order).
64    ///
65    /// Other methods should allow the CIGAR string to be viewed
66    /// in the correct (not reversed) order.
67    ///
68    /// The total number of added operations must not exceed the
69    /// maximum length the CIGAR string was created with.
70    #[allow(dead_code)]
71    pub(crate) unsafe fn add(&mut self, op: Operation) {
72        debug_assert!(self.idx < self.s.len());
73        // branchlessly append one operation
74        // make sure that contiguous operations are run-length encoded
75        let add = (op != (*self.s.as_ptr().add(self.idx - 1)).op) as usize;
76        self.idx += add;
77        (*self.s.as_mut_ptr().add(self.idx - 1)).op = op;
78        (*self.s.as_mut_ptr().add(self.idx - 1)).len += 1;
79    }
80
81    /// Reverse the CIGAR string in place.
82    pub fn reverse(&mut self) {
83        self.s[1..self.idx].reverse();
84    }
85
86    /// Length of the CIGAR string, not including the first sentinel.
87    pub fn len(&self) -> usize {
88        self.idx - 1
89    }
90
91    /// Get a certain operation in the CIGAR string.
92    pub fn get(&self, i: usize) -> OpLen {
93        self.s[self.idx - 1 - i]
94    }
95
96    /// Generate two strings to visualize the edit operations.
97    pub fn format(&self, q: &[u8], r: &[u8]) -> (String, String) {
98        let mut a = String::with_capacity(self.idx);
99        let mut b = String::with_capacity(self.idx);
100        let mut i = 0;
101        let mut j = 0;
102
103        for &op_len in self.s[1..self.idx].iter().rev() {
104            match op_len.op {
105                Operation::M | Operation::Eq | Operation::X => {
106                    for _k in 0..op_len.len {
107                        a.push(q[i] as char);
108                        b.push(r[j] as char);
109                        i += 1;
110                        j += 1;
111                    }
112                },
113                Operation::I => {
114                    for _k in 0..op_len.len {
115                        a.push(q[i] as char);
116                        b.push('-');
117                        i += 1;
118                    }
119                },
120                Operation::D => {
121                    for _k in 0..op_len.len {
122                        a.push('-');
123                        b.push(r[j] as char);
124                        j += 1;
125                    }
126                },
127                _ => continue
128            }
129        }
130
131        (a, b)
132    }
133
134    /// Create a copy of the operations in the CIGAR string and
135    /// ensure that the vector is provided in the correct order.
136    ///
137    /// Sentinels are removed.
138    pub fn to_vec(&self) -> Vec<OpLen> {
139        self.s[1..self.idx]
140            .iter()
141            .rev()
142            .map(|&op_len| op_len)
143            .collect::<Vec<OpLen>>()
144    }
145}
146
147impl fmt::Display for Cigar {
148    /// Print a CIGAR string in standard CIGAR format.
149    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
150        for &op_len in self.s[1..self.idx].iter().rev() {
151            let c = match op_len.op {
152                Operation::M => 'M',
153                Operation::Eq => '=',
154                Operation::X => 'X',
155                Operation::I => 'I',
156                Operation::D => 'D',
157                _ => continue
158            };
159            write!(f, "{}{}", op_len.len, c)?;
160        }
161        Ok(())
162    }
163}