Skip to main content

pa_types/
cigar.rs

1/*!
2Cigar string types.
3
4Uses SAM cigar conventions. See <https://timd.one/blog/genomics/cigar.php> (table below).
5
6In particular, an "insertion" is a character in the pattern/query (2nd `Pos` coordinate) that is not in the text/reference (first `Pos` coordinate), and opposite for deletions.
7
8We never output `M` characters, but do parse these into `=` and `X`, which need the corresponding text/pattern string to be resolved.
9
10Undetermined bases (`N`), clipping (`S`, `H`), and padding (`P`) are not supported.
11
12```text
13+----------------+----------------+----------------------------------------------+---------------+---------------+
14| Symbol         | Name           | Brief Description                            | Consumes Query| Consumes Ref  |
15+----------------+----------------+----------------------------------------------+---------------+---------------+
16| M              | Match          | No insertion or deletions, bases may differ  | ✓             | ✓             |
17| I              | Insertion      | Additional base in query (not in reference)  | ✓             | ✗             |
18| D              | Deletion       | Query is missing base from reference         | ✗             | ✓             |
19| =              | Equal          | No insertions or deletions, bases agree      | ✓             | ✓             |
20| X              | Not Equal      | No insertions or deletions, bases differ     | ✓             | ✓             |
21| N              | None           | No query bases to align (spliced read)       | ✗             | ✓             |
22| S              | Soft-Clipped   | Bases on end of read not aligned but stored  | ✓             | ✗             |
23| H              | Hard-Clipped   | Bases on end of read not aligned, not stored | ✗             | ✗             |
24| P              | Padding        | Neither read nor reference has a base        | ✗             | ✗             |
25+----------------+----------------+----------------------------------------------+---------------+---------------+
26```
27
28*/
29
30use itertools::Itertools;
31use serde::{Deserialize, Serialize};
32use std::fmt::Write;
33
34use crate::*;
35
36/// A single cigar string character.
37#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
38pub enum CigarOp {
39    /// `=`
40    Match,
41    /// `X`
42    Sub,
43    /// `D`
44    Del,
45    /// `I`
46    Ins,
47}
48
49/// A cigar string character with the corresponding characters from text and pattern.
50#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
51pub enum CigarOpChars {
52    Match(u8),
53    Sub(u8, u8),
54    Del(u8),
55    Ins(u8),
56}
57
58/// A single repeated cigar element, e.g. `5=` or `3I`.
59#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
60pub struct CigarElem {
61    pub op: CigarOp,
62    pub cnt: I,
63}
64
65impl CigarElem {
66    pub fn new(op: CigarOp, cnt: I) -> Self {
67        Self { op, cnt }
68    }
69}
70
71/// Main type representing a Cigar string.
72///
73/// Adjacent equal operations are typically merged into [`CigarElem`] with `cnt>1`.
74// This is similar to https://docs.rs/bio/1.0.0/bio/alignment/struct.Alignment.html,
75// but more specific for our use case.
76#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Default)]
77pub struct Cigar {
78    pub ops: Vec<CigarElem>,
79}
80
81impl CigarOp {
82    /// Convert to one of `=XID`.
83    pub fn to_char(&self) -> char {
84        match self {
85            CigarOp::Match => '=',
86            CigarOp::Sub => 'X',
87            CigarOp::Ins => 'I',
88            CigarOp::Del => 'D',
89        }
90    }
91
92    /// Convert operation to `(text_pos, pattern_pos)` delta/step of path indices.
93    #[inline(always)]
94    pub fn delta(&self) -> Pos {
95        match self {
96            CigarOp::Match | CigarOp::Sub => Pos(1, 1),
97            CigarOp::Del => Pos(1, 0), // deletion: consumes ref/text only
98            CigarOp::Ins => Pos(0, 1), // insertion: consumes pattern/query only
99        }
100    }
101
102    /// Converts path delta `(text_pos, pattern_pos)` to `CigarOp`.
103    ///
104    /// Ignores [`CigarOp::Sub`]: (1,1) is always [`CigarOp::Match`].
105    #[inline(always)]
106    pub fn from_delta(delta: Pos) -> Self {
107        match delta {
108            Pos(0, 1) => CigarOp::Ins,
109            Pos(1, 0) => CigarOp::Del,
110            Pos(1, 1) => CigarOp::Match,
111            _ => panic!("Invalid delta: {:?}", delta),
112        }
113    }
114}
115
116// Mostly to make verify/resolve matches clearer while still using the delta mapping
117impl std::ops::Mul<I> for Pos {
118    type Output = Pos;
119    fn mul(self, rhs: I) -> Pos {
120        Pos(self.0 * rhs, self.1 * rhs)
121    }
122}
123
124impl From<u8> for CigarOp {
125    /// Convert from `=MXID` to `CigarOp`.
126    ///
127    /// `M` is interpreted as `=` ([`CigarOp::Match`]) and should be resolved into `=` or `X` ([`CigarOp::Sub`]) via `resolve_matches`.
128    fn from(op: u8) -> Self {
129        match op {
130            b'=' | b'M' => CigarOp::Match,
131            b'X' => CigarOp::Sub,
132            b'I' => CigarOp::Ins,
133            b'D' => CigarOp::Del,
134            _ => panic!("Invalid CigarOp"),
135        }
136    }
137}
138
139impl ToString for Cigar {
140    /// Format the cigar to a `String`.
141    fn to_string(&self) -> String {
142        let mut s = String::new();
143        for elem in &self.ops {
144            write!(&mut s, "{}{}", elem.cnt, elem.op.to_char()).unwrap();
145        }
146        s
147    }
148}
149
150impl Cigar {
151    /// Concatenate the single-characters ops.
152    pub fn from_ops(ops: impl Iterator<Item = CigarOp>) -> Self {
153        Cigar {
154            ops: ops
155                .chunk_by(|&op| op)
156                .into_iter()
157                .map(|(op, group)| CigarElem::new(op, group.count() as _))
158                .collect(),
159        }
160    }
161
162    /// Create Cigar from path and corresponding sequences.
163    ///
164    /// Path must have `(text_pos, pattern_pos)` pairs.
165    /// To distinguish between match and sub it uses simple
166    /// equality (i.e. c1==c2) via `resolve_matches`, so IUPAC matching is not supported.
167    pub fn from_path(text: Seq, pattern: Seq, path: &Path) -> Cigar {
168        if path[0] != Pos(0, 0) {
169            panic!("Path must start at (0,0)!");
170        }
171        Self::resolve_matches(
172            path.iter()
173                .tuple_windows()
174                .map(|(&text_pos, &pattern_pos)| {
175                    CigarElem::new(CigarOp::from_delta(pattern_pos - text_pos), 1)
176                }),
177            text,
178            pattern,
179        )
180    }
181
182    /// Return the diff from pattern to text.
183    pub fn to_char_pairs<'s>(&'s self, text: &'s [u8], pattern: &'s [u8]) -> Vec<CigarOpChars> {
184        let mut pos = Pos(0, 0);
185        let fix_case = !(b'A' ^ b'a');
186        let mut out = vec![];
187        for el in &self.ops {
188            for _ in 0..el.cnt {
189                let c = match el.op {
190                    CigarOp::Match => {
191                        // NOTE: IUPAC characters can be matching even when they're not equal.
192                        // assert_eq!(
193                        //     (pattern[pos.0 as usize] & fix_case) as char,
194                        //     (text[pos.1 as usize] & fix_case) as char,
195                        //     "mismatch for {pos:?}"
196                        // );
197                        CigarOpChars::Match(text[pos.0 as usize])
198                    }
199                    CigarOp::Sub => {
200                        assert_ne!(
201                            (text[pos.0 as usize] & fix_case) as char,
202                            (pattern[pos.1 as usize] & fix_case) as char,
203                            "cigar {:?}\npattern {:?}\ntext    {:?}\nmismatch for {pos:?}",
204                            self.to_string(),
205                            String::from_utf8_lossy(pattern),
206                            String::from_utf8_lossy(text)
207                        );
208                        CigarOpChars::Sub(text[pos.0 as usize], pattern[pos.1 as usize])
209                    }
210                    CigarOp::Del => {
211                        // Note deletion consumes text hence text slice
212                        CigarOpChars::Del(text[pos.0 as usize])
213                    }
214                    CigarOp::Ins => {
215                        // Note insertion consumes pattern hence pattern slice
216                        CigarOpChars::Ins(pattern[pos.1 as usize])
217                    }
218                };
219                out.push(c);
220                pos += el.op.delta();
221            }
222        }
223        out
224    }
225
226    /// Get the `Path` corresponding to this [`Cigar`].
227    pub fn to_path(&self) -> Path {
228        let mut pos = Pos(0, 0);
229        let mut path = vec![pos];
230        for el in &self.ops {
231            for _ in 0..el.cnt {
232                pos += el.op.delta();
233                path.push(pos);
234            }
235        }
236        path
237    }
238
239    /// Get the `Path` and the alignment `Cost` to each position.
240    pub fn to_path_with_costs(&self, cm: CostModel) -> Vec<(Pos, Cost)> {
241        let mut pos = Pos(0, 0);
242        let mut cost = 0;
243        let mut path = vec![(pos, cost)];
244
245        for el in &self.ops {
246            match el.op {
247                CigarOp::Match => {
248                    for _ in 0..el.cnt {
249                        pos += el.op.delta();
250                        path.push((pos, cost));
251                    }
252                }
253                CigarOp::Sub => {
254                    for _ in 0..el.cnt {
255                        pos += el.op.delta();
256                        cost += cm.sub;
257                        path.push((pos, cost));
258                    }
259                }
260                CigarOp::Ins => {
261                    for len in 1..=(el.cnt as Cost) {
262                        pos += el.op.delta();
263                        path.push((pos, cost + cm.ins(len)));
264                    }
265                    cost += cm.ins(el.cnt);
266                }
267                CigarOp::Del => {
268                    for len in 1..=(el.cnt as Cost) {
269                        pos += el.op.delta();
270                        path.push((pos, cost + cm.del(len)));
271                    }
272                    cost += cm.del(el.cnt);
273                }
274            }
275        }
276        path
277    }
278
279    /// Push a [`CigarOp`] to the cigar.
280    pub fn push(&mut self, op: CigarOp) {
281        if let Some(s) = self.ops.last_mut() {
282            if s.op == op {
283                s.cnt += 1;
284                return;
285            }
286        }
287        self.ops.push(CigarElem { op, cnt: 1 });
288    }
289
290    /// Push a [`CigarElem`] to the cigar.
291    pub fn push_elem(&mut self, e: CigarElem) {
292        if let Some(s) = self.ops.last_mut() {
293            if s.op == e.op {
294                s.cnt += e.cnt;
295                return;
296            }
297        }
298        self.ops.push(e);
299    }
300
301    /// Push `cnt` matches.
302    pub fn push_matches(&mut self, cnt: I) {
303        if let Some(s) = self.ops.last_mut() {
304            if s.op == CigarOp::Match {
305                s.cnt += cnt;
306                return;
307            }
308        }
309        self.ops.push(CigarElem {
310            op: CigarOp::Match,
311            cnt: cnt as _,
312        });
313    }
314
315    /// Check that the cigar is valid between `text` and `pattern` and return the cost.
316    pub fn verify(&self, cm: &CostModel, text: Seq, pattern: Seq) -> Result<Cost, &str> {
317        let mut pos = Pos(0, 0);
318        let mut cost: Cost = 0;
319
320        for &CigarElem { op, cnt } in &self.ops {
321            match op {
322                CigarOp::Match => {
323                    for _ in 0..cnt {
324                        if text.get(pos.0 as usize) != pattern.get(pos.1 as usize) {
325                            return Err("Expected match but found substitution.");
326                        }
327                        pos += op.delta();
328                    }
329                }
330                CigarOp::Sub => {
331                    for _ in 0..cnt {
332                        if text.get(pos.0 as usize) == pattern.get(pos.1 as usize) {
333                            return Err("Expected substitution but found match.");
334                        }
335                        pos += op.delta();
336                        cost += cm.sub;
337                    }
338                }
339                CigarOp::Ins => {
340                    cost += cm.open + cnt as Cost * cm.extend;
341                    pos += op.delta() * cnt;
342                }
343                CigarOp::Del => {
344                    cost += cm.open + cnt as Cost * cm.extend;
345                    pos += op.delta() * cnt;
346                }
347            }
348        }
349        if pos != Pos(text.len() as I, pattern.len() as I) {
350            return Err("Wrong alignment length.");
351        }
352
353        Ok(cost)
354    }
355
356    /// Splits all 'M'/[`CigarOp::Match`] into matches (`=`) and substitutions (`X`), and joins consecutive equal elements.
357    pub fn resolve_matches(ops: impl Iterator<Item = CigarElem>, text: Seq, pattern: Seq) -> Self {
358        let mut pos = Pos(0, 0);
359        let mut c = Cigar { ops: vec![] };
360        for CigarElem { op, cnt } in ops {
361            match op {
362                CigarOp::Match => {
363                    for _ in 0..cnt {
364                        c.push(if text[pos.0 as usize] == pattern[pos.1 as usize] {
365                            CigarOp::Match
366                        } else {
367                            CigarOp::Sub
368                        });
369                        pos += op.delta();
370                    }
371                    continue;
372                }
373                _ => {
374                    pos += op.delta() * cnt;
375                }
376            };
377            c.push_elem(CigarElem { op, cnt });
378        }
379        c
380    }
381
382    /// A simpler parsing function that only parses strings of characters `M=XID`, without preceding counts.
383    /// Consecutive characters are grouped, and `M` and `=` chars are resolved into `=` and `X`.
384    pub fn parse_without_counts(s: &str, text: Seq, pattern: Seq) -> Self {
385        Self::resolve_matches(
386            s.as_bytes().iter().map(|&op| CigarElem {
387                op: op.into(),
388                cnt: 1,
389            }),
390            text,
391            pattern,
392        )
393    }
394
395    /// A simpler parsing function that only parses strings of characters `MXID`, without preceding counts.
396    /// Consecutive characters are grouped. `M` chars are *not* resolved and assumed to mean `=`.
397    pub fn parse_without_resolving(s: &str) -> Self {
398        let mut c = Cigar { ops: vec![] };
399        for &op in s.as_bytes() {
400            c.push(op.into())
401        }
402        c
403    }
404
405    /// Parse a Cigar string with optional counts
406    pub fn from_string(s: &str) -> Self {
407        let mut c = Cigar { ops: vec![] };
408        for slice in s.as_bytes().split_inclusive(|b| !b.is_ascii_digit()) {
409            let (&op, cnt_bytes) = slice.split_last().expect("Cigar string cannot be empty");
410            let cnt = if cnt_bytes.is_empty() {
411                1
412            } else {
413                unsafe { std::str::from_utf8_unchecked(cnt_bytes) }
414                    .parse()
415                    .expect("Invalid Cigar count")
416            };
417            c.push_elem(CigarElem { op: op.into(), cnt });
418        }
419        c
420    }
421
422    /// A more generic (and slower) parsing function that also allows optional counts, e.g. `5M2X3M`.
423    /// Consecutive characters are grouped, and `M` and `=` chars are resolved into `=` and `X`.
424    pub fn parse(s: &str, text: Seq, pattern: Seq) -> Self {
425        Self::resolve_matches(
426            s.as_bytes()
427                .split_inclusive(|pattern| pattern.is_ascii_alphabetic())
428                .map(|pattern_slice| {
429                    let (&op, cnt) = pattern_slice.split_last().unwrap();
430                    let cnt = if cnt.is_empty() {
431                        1
432                    } else {
433                        unsafe { std::str::from_utf8_unchecked(cnt) }
434                            .parse()
435                            .unwrap()
436                    };
437                    CigarElem { op: op.into(), cnt }
438                }),
439            text,
440            pattern,
441        )
442    }
443
444    /// Clear the internal vector.
445    pub fn clear(&mut self) {
446        self.ops.clear();
447    }
448
449    /// Reverse the cigar string, for `rc(text)` and `rc(pattern)`.
450    pub fn reverse(&mut self) {
451        self.ops.reverse();
452    }
453}
454
455#[cfg(test)]
456mod tests {
457    use super::*;
458
459    #[test]
460    fn test_delta() {
461        for op in [CigarOp::Match, CigarOp::Del, CigarOp::Ins] {
462            assert_eq!(CigarOp::from_delta(op.delta()), op);
463        }
464        assert_eq!(CigarOp::from_delta(Pos(1, 1)), CigarOp::Match);
465        //  assert_eq!(CigarOp::from_delta(Pos(1, 1)), CigarOp::Sub);
466        assert_eq!(CigarOp::from_delta(Pos(1, 0)), CigarOp::Del); // Consume text
467        assert_eq!(CigarOp::from_delta(Pos(0, 1)), CigarOp::Ins); // Consume pattern
468    }
469
470    #[test]
471    fn test_valid_eq() {
472        let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2)]);
473        assert_eq!(c.to_string(), "1=1X");
474    }
475
476    #[test]
477    fn test_invalid_end_length() {
478        // (0,1)is invalid, text is longer than path
479        let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(0, 1)]);
480        assert!(c.verify(&CostModel::unit(), b"ab", b"aa").is_err());
481    }
482
483    #[test]
484    fn to_string() {
485        let c = Cigar {
486            ops: vec![
487                CigarElem {
488                    op: CigarOp::Ins,
489                    cnt: 1,
490                },
491                CigarElem {
492                    op: CigarOp::Match,
493                    cnt: 2,
494                },
495            ],
496        };
497        assert_eq!(c.to_string(), "1I2=");
498    }
499
500    #[test]
501    fn from_path() {
502        let c = Cigar::from_path(
503            b"aaa",
504            b"aabc",
505            &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2), Pos(3, 3), Pos(3, 4)],
506        );
507        assert_eq!(c.to_string(), "2=1X1I");
508    }
509
510    #[test]
511    fn from_string_with_count() {
512        let c = Cigar::from_string("24=");
513        assert_eq!(c.ops.len(), 1);
514        assert_eq!(c.ops[0], CigarElem::new(CigarOp::Match, 24));
515    }
516
517    #[test]
518    fn from_string_mixed_ops() {
519        let c = Cigar::from_string("2=3I1X");
520        assert_eq!(
521            c.ops,
522            vec![
523                CigarElem::new(CigarOp::Match, 2),
524                CigarElem::new(CigarOp::Ins, 3),
525                CigarElem::new(CigarOp::Sub, 1)
526            ]
527        );
528    }
529
530    #[test]
531    fn from_string_no_counts() {
532        let c = Cigar::from_string("=XIDDD");
533        assert_eq!(
534            c.ops,
535            vec![
536                CigarElem::new(CigarOp::Match, 1),
537                CigarElem::new(CigarOp::Sub, 1),
538                CigarElem::new(CigarOp::Ins, 1),
539                CigarElem::new(CigarOp::Del, 3),
540            ]
541        );
542    }
543
544    #[test]
545    #[rustfmt::skip]
546    fn push_to_path() {
547        let mut c = Cigar::default();
548                                // 0 0
549        c.push(CigarOp::Match); // 1  1
550        c.push(CigarOp::Del);   // 2  1  (text +1)
551        c.push(CigarOp::Ins);   // 2  2  (pattern +1)
552        c.push(CigarOp::Sub);   // 3  3
553
554        assert_eq!(
555            c.to_path(),
556            [
557                Pos(0, 0),
558                Pos(1, 1),
559                Pos(2, 1),
560                Pos(2, 2),
561                Pos(3, 3),
562            ]
563        );
564    }
565
566    #[test]
567    fn to_char_pairs_all_match() {
568        let c = Cigar::from_string("3=");
569        let pairs = c.to_char_pairs(b"aaa", b"aaa");
570        assert_eq!(
571            pairs,
572            vec![
573                CigarOpChars::Match(b'a'),
574                CigarOpChars::Match(b'a'),
575                CigarOpChars::Match(b'a'),
576            ]
577        );
578    }
579
580    #[test]
581    fn to_char_pairs_sub() {
582        let c = Cigar::from_string("1X");
583        let pairs = c.to_char_pairs(b"a", b"c");
584        assert_eq!(pairs, vec![CigarOpChars::Sub(b'a', b'c')]);
585    }
586
587    #[test]
588    fn to_char_pairs_ins() {
589        let c = Cigar::from_string("1=1I1=");
590        let pairs = c.to_char_pairs(b"ac", b"abc");
591        assert_eq!(
592            pairs,
593            vec![
594                CigarOpChars::Match(b'a'),
595                CigarOpChars::Ins(b'b'),
596                CigarOpChars::Match(b'c'),
597            ]
598        );
599    }
600
601    #[test]
602    fn to_char_pairs_del() {
603        let c = Cigar::from_string("1=1D1=");
604        let pairs = c.to_char_pairs(b"abc", b"ac");
605        assert_eq!(
606            pairs,
607            vec![
608                CigarOpChars::Match(b'a'),
609                CigarOpChars::Del(b'b'),
610                CigarOpChars::Match(b'c'),
611            ]
612        );
613    }
614
615    #[test]
616    fn to_char_pairs_mixed() {
617        let c = Cigar::from_string("2=1X1I");
618        let pairs = c.to_char_pairs(b"abZd", b"abYc");
619        assert_eq!(
620            pairs,
621            vec![
622                CigarOpChars::Match(b'a'),
623                CigarOpChars::Match(b'b'),
624                CigarOpChars::Sub(b'Z', b'Y'),
625                CigarOpChars::Ins(b'c'),
626            ]
627        );
628    }
629}