1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
//! Data structures and functions for working with CIGAR strings.

use std::fmt;

/// A match/mismatch, insertion, or deletion operation.
///
/// When aligning `q` against `r`, this represents the edit operations to get from `r` to `q`.
#[derive(Debug, PartialEq, Copy, Clone)]
#[repr(u8)]
pub enum Operation {
    /// Placeholder variant.
    Sentinel = 0u8,
    /// Match or mismatch.
    ///
    /// This is a diagonal transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
    M = 1u8,
    /// Match.
    Eq = 2u8,
    /// Mismatch.
    X = 3u8,
    /// Insertion.
    ///
    /// When aligning sequences `q` against `r`, this is a gap in `r`.
    /// This is a row transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
    I = 4u8,
    /// Deletion.
    ///
    /// When aligning sequences `q` against `r`, this is a gap in `q`.
    /// This is a column transition in the DP matrix with `|q| + 1` rows and `|r| + 1` columns.
    D = 5u8
}

/// An operation and how many times that operation is repeated.
#[derive(Debug, Copy, Clone)]
#[repr(C)]
pub struct OpLen {
    pub op: Operation,
    pub len: usize
}

/// A CIGAR string that holds a list of operations.
pub struct Cigar {
    s: Vec<OpLen>,
    idx: usize
}

impl Cigar {
    /// Create a new CIGAR string with certain maximum lengths for the aligned sequences.
    pub fn new(query_len: usize, reference_len: usize) -> Self {
        let s = vec![OpLen { op: Operation::Sentinel, len: 0 }; query_len + reference_len + 5];
        // first element should always be a sentinel
        let idx = 1;
        Cigar { s, idx }
    }

    /// Clear this CIGAR string.
    #[allow(dead_code)]
    pub(crate) fn clear(&mut self, query_len: usize, reference_len: usize) {
        self.s[..query_len + reference_len + 5].fill(OpLen { op: Operation::Sentinel, len: 0 });
        self.idx = 1;
    }

    /// Branchlessly add a new operation (in reverse order).
    ///
    /// Other methods should allow the CIGAR string to be viewed
    /// in the correct (not reversed) order.
    ///
    /// The total number of added operations must not exceed the
    /// maximum length the CIGAR string was created with.
    #[allow(dead_code)]
    pub(crate) unsafe fn add(&mut self, op: Operation) {
        debug_assert!(self.idx < self.s.len());
        // branchlessly append one operation
        // make sure that contiguous operations are run-length encoded
        let add = (op != (*self.s.as_ptr().add(self.idx - 1)).op) as usize;
        self.idx += add;
        (*self.s.as_mut_ptr().add(self.idx - 1)).op = op;
        (*self.s.as_mut_ptr().add(self.idx - 1)).len += 1;
    }

    /// Length of the CIGAR string, not including the first sentinel.
    pub fn len(&self) -> usize {
        self.idx - 1
    }

    /// Get a certain operation in the CIGAR string.
    pub fn get(&self, i: usize) -> OpLen {
        self.s[self.idx - 1 - i]
    }

    /// Generate two strings to visualize the edit operations.
    pub fn format(&self, q: &[u8], r: &[u8]) -> (String, String) {
        let mut a = String::with_capacity(self.idx);
        let mut b = String::with_capacity(self.idx);
        let mut i = 0;
        let mut j = 0;

        for &op_len in self.s[1..self.idx].iter().rev() {
            match op_len.op {
                Operation::M | Operation::Eq | Operation::X => {
                    for _k in 0..op_len.len {
                        a.push(q[i] as char);
                        b.push(r[j] as char);
                        i += 1;
                        j += 1;
                    }
                },
                Operation::I => {
                    for _k in 0..op_len.len {
                        a.push(q[i] as char);
                        b.push('-');
                        i += 1;
                    }
                },
                Operation::D => {
                    for _k in 0..op_len.len {
                        a.push('-');
                        b.push(r[j] as char);
                        j += 1;
                    }
                },
                _ => continue
            }
        }

        (a, b)
    }

    /// Create a copy of the operations in the CIGAR string and
    /// ensure that the vector is provided in the correct order.
    ///
    /// Sentinels are removed.
    pub fn to_vec(&self) -> Vec<OpLen> {
        self.s[1..self.idx]
            .iter()
            .rev()
            .map(|&op_len| op_len)
            .collect::<Vec<OpLen>>()
    }
}

impl fmt::Display for Cigar {
    /// Print a CIGAR string in standard CIGAR format.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for &op_len in self.s[1..self.idx].iter().rev() {
            let c = match op_len.op {
                Operation::M => 'M',
                Operation::Eq => '=',
                Operation::X => 'X',
                Operation::I => 'I',
                Operation::D => 'D',
                _ => continue
            };
            write!(f, "{}{}", op_len.len, c)?;
        }
        Ok(())
    }
}