Skip to main content

pa_types/
cigar.rs

1/*!
2Cigar string types.
3
4Uses SAM cigar conventions. See <https://timd.one/blog/genomics/cigar.php> (table below).
5
6In particular, an "insertion" is a character in the pattern/query (2nd `Pos` coordinate) that is not in the text/reference (first `Pos` coordinate), and opposite for deletions.
7
8We never output `M` characters, but do parse these into `=` and `X`, which need the corresponding text/pattern string to be resolved.
9
10Undetermined bases (`N`), clipping (`S`, `H`), and padding (`P`) are not supported.
11
12```text
13+----------------+----------------+----------------------------------------------+---------------+---------------+
14| Symbol         | Name           | Brief Description                            | Consumes Query| Consumes Ref  |
15+----------------+----------------+----------------------------------------------+---------------+---------------+
16| M              | Match          | No insertion or deletions, bases may differ  | ✓             | ✓             |
17| I              | Insertion      | Additional base in query (not in reference)  | ✓             | ✗             |
18| D              | Deletion       | Query is missing base from reference         | ✗             | ✓             |
19| =              | Equal          | No insertions or deletions, bases agree      | ✓             | ✓             |
20| X              | Not Equal      | No insertions or deletions, bases differ     | ✓             | ✓             |
21| N              | None           | No query bases to align (spliced read)       | ✗             | ✓             |
22| S              | Soft-Clipped   | Bases on end of read not aligned but stored  | ✓             | ✗             |
23| H              | Hard-Clipped   | Bases on end of read not aligned, not stored | ✗             | ✗             |
24| P              | Padding        | Neither read nor reference has a base        | ✗             | ✗             |
25+----------------+----------------+----------------------------------------------+---------------+---------------+
26```
27
28*/
29
30use itertools::Itertools;
31use serde::{Deserialize, Serialize};
32use std::fmt::Write;
33
34use crate::*;
35
36/// A single cigar string character.
37#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
38pub enum CigarOp {
39    /// `=`
40    Match,
41    /// `X`
42    Sub,
43    /// `D`
44    Del,
45    /// `I`
46    Ins,
47}
48
49/// A cigar string character with the corresponding characters from text and pattern.
50#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
51pub enum CigarOpChars {
52    /// The text character of the matching pair.
53    Match(u8),
54    /// The mismatching text and pattern character, in this order.
55    Sub(u8, u8),
56    /// The text character that is missing from the pattern.
57    Del(u8),
58    /// The pattern character that is missing from the text.
59    Ins(u8),
60}
61
62/// A single repeated cigar element, e.g. `5=` or `3I`.
63#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)]
64pub struct CigarElem {
65    pub op: CigarOp,
66    pub cnt: I,
67}
68
69impl CigarElem {
70    pub fn new(op: CigarOp, cnt: I) -> Self {
71        Self { op, cnt }
72    }
73}
74
75/// Main type representing a Cigar string.
76///
77/// Adjacent equal operations are typically merged into [`CigarElem`] with `cnt>1`.
78// This is similar to https://docs.rs/bio/1.0.0/bio/alignment/struct.Alignment.html,
79// but more specific for our use case.
80#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Default)]
81pub struct Cigar {
82    pub ops: Vec<CigarElem>,
83}
84
85impl CigarOp {
86    /// Convert to one of `=XID`.
87    pub fn to_char(&self) -> char {
88        match self {
89            CigarOp::Match => '=',
90            CigarOp::Sub => 'X',
91            CigarOp::Ins => 'I',
92            CigarOp::Del => 'D',
93        }
94    }
95
96    /// Convert operation to `(text_pos, pattern_pos)` delta/step of path indices.
97    #[inline(always)]
98    pub fn delta(&self) -> Pos {
99        match self {
100            CigarOp::Match | CigarOp::Sub => Pos(1, 1),
101            CigarOp::Del => Pos(1, 0), // deletion: consumes ref/text only
102            CigarOp::Ins => Pos(0, 1), // insertion: consumes pattern/query only
103        }
104    }
105
106    /// The edit-distance cost of the operation.
107    ///
108    /// 0 for `Match`, 1 otherwise.
109    #[inline(always)]
110    pub fn edit_cost(&self) -> Cost {
111        match self {
112            CigarOp::Match => 0,
113            _ => 1,
114        }
115    }
116
117    /// Converts path delta `(text_pos, pattern_pos)` to `CigarOp`.
118    ///
119    /// Ignores [`CigarOp::Sub`]: (1,1) is always [`CigarOp::Match`].
120    #[inline(always)]
121    pub fn from_delta(delta: Pos) -> Self {
122        match delta {
123            Pos(0, 1) => CigarOp::Ins,
124            Pos(1, 0) => CigarOp::Del,
125            Pos(1, 1) => CigarOp::Match,
126            _ => panic!("Invalid delta: {:?}", delta),
127        }
128    }
129}
130
131// Mostly to make verify/resolve matches clearer while still using the delta mapping
132impl std::ops::Mul<I> for Pos {
133    type Output = Pos;
134    fn mul(self, rhs: I) -> Pos {
135        Pos(self.0 * rhs, self.1 * rhs)
136    }
137}
138
139impl From<u8> for CigarOp {
140    /// Convert from `=MXID` to `CigarOp`.
141    ///
142    /// `M` is interpreted as `=` ([`CigarOp::Match`]) and should be resolved into `=` or `X` ([`CigarOp::Sub`]) via `resolve_matches`.
143    fn from(op: u8) -> Self {
144        match op {
145            b'=' | b'M' => CigarOp::Match,
146            b'X' => CigarOp::Sub,
147            b'I' => CigarOp::Ins,
148            b'D' => CigarOp::Del,
149            _ => panic!("Invalid CigarOp"),
150        }
151    }
152}
153
154impl ToString for Cigar {
155    /// Format the cigar to a `String`.
156    fn to_string(&self) -> String {
157        let mut s = String::new();
158        for elem in &self.ops {
159            write!(&mut s, "{}{}", elem.cnt, elem.op.to_char()).unwrap();
160        }
161        s
162    }
163}
164
165impl Cigar {
166    /// Concatenate the single-characters ops.
167    pub fn from_ops(ops: impl Iterator<Item = CigarOp>) -> Self {
168        Cigar {
169            ops: ops
170                .chunk_by(|&op| op)
171                .into_iter()
172                .map(|(op, group)| CigarElem::new(op, group.count() as _))
173                .collect(),
174        }
175    }
176
177    /// Create Cigar from path and corresponding sequences.
178    ///
179    /// Path must have `(text_pos, pattern_pos)` pairs.
180    /// To distinguish between match and sub it uses simple
181    /// equality (i.e. c1==c2) via `resolve_matches`, so IUPAC matching is not supported.
182    pub fn from_path(text: Seq, pattern: Seq, path: &Path) -> Cigar {
183        if path[0] != Pos(0, 0) {
184            panic!("Path must start at (0,0)!");
185        }
186        Self::resolve_matches(
187            path.iter()
188                .tuple_windows()
189                .map(|(&text_pos, &pattern_pos)| {
190                    CigarElem::new(CigarOp::from_delta(pattern_pos - text_pos), 1)
191                }),
192            text,
193            pattern,
194        )
195    }
196
197    /// Return the diff from pattern to text.
198    pub fn to_char_pairs<'s>(&'s self, text: &'s [u8], pattern: &'s [u8]) -> Vec<CigarOpChars> {
199        let mut pos = Pos(0, 0);
200        let fix_case = !(b'A' ^ b'a');
201        let mut out = vec![];
202        for el in &self.ops {
203            for _ in 0..el.cnt {
204                let c = match el.op {
205                    CigarOp::Match => {
206                        // NOTE: IUPAC characters can be matching even when they're not equal.
207                        // assert_eq!(
208                        //     (pattern[pos.0 as usize] & fix_case) as char,
209                        //     (text[pos.1 as usize] & fix_case) as char,
210                        //     "mismatch for {pos:?}"
211                        // );
212                        CigarOpChars::Match(text[pos.0 as usize])
213                    }
214                    CigarOp::Sub => {
215                        assert_ne!(
216                            (text[pos.0 as usize] & fix_case) as char,
217                            (pattern[pos.1 as usize] & fix_case) as char,
218                            "cigar {:?}\npattern {:?}\ntext    {:?}\nmismatch for {pos:?}",
219                            self.to_string(),
220                            String::from_utf8_lossy(pattern),
221                            String::from_utf8_lossy(text)
222                        );
223                        CigarOpChars::Sub(text[pos.0 as usize], pattern[pos.1 as usize])
224                    }
225                    CigarOp::Del => {
226                        // Note deletion consumes text hence text slice
227                        CigarOpChars::Del(text[pos.0 as usize])
228                    }
229                    CigarOp::Ins => {
230                        // Note insertion consumes pattern hence pattern slice
231                        CigarOpChars::Ins(pattern[pos.1 as usize])
232                    }
233                };
234                out.push(c);
235                pos += el.op.delta();
236            }
237        }
238        out
239    }
240
241    /// Get the `Path` corresponding to this [`Cigar`].
242    pub fn to_path(&self) -> Path {
243        let mut pos = Pos(0, 0);
244        let mut path = vec![pos];
245        for el in &self.ops {
246            for _ in 0..el.cnt {
247                pos += el.op.delta();
248                path.push(pos);
249            }
250        }
251        path
252    }
253
254    /// Get the `Path` and the alignment `Cost` to each position.
255    pub fn to_path_with_costs(&self, cm: CostModel) -> Vec<(Pos, Cost)> {
256        let mut pos = Pos(0, 0);
257        let mut cost = 0;
258        let mut path = vec![(pos, cost)];
259
260        for el in &self.ops {
261            match el.op {
262                CigarOp::Match => {
263                    for _ in 0..el.cnt {
264                        pos += el.op.delta();
265                        path.push((pos, cost));
266                    }
267                }
268                CigarOp::Sub => {
269                    for _ in 0..el.cnt {
270                        pos += el.op.delta();
271                        cost += cm.sub;
272                        path.push((pos, cost));
273                    }
274                }
275                CigarOp::Ins => {
276                    for len in 1..=(el.cnt as Cost) {
277                        pos += el.op.delta();
278                        path.push((pos, cost + cm.ins(len)));
279                    }
280                    cost += cm.ins(el.cnt);
281                }
282                CigarOp::Del => {
283                    for len in 1..=(el.cnt as Cost) {
284                        pos += el.op.delta();
285                        path.push((pos, cost + cm.del(len)));
286                    }
287                    cost += cm.del(el.cnt);
288                }
289            }
290        }
291        path
292    }
293
294    /// Push a [`CigarOp`] to the cigar.
295    pub fn push(&mut self, op: CigarOp) {
296        if let Some(s) = self.ops.last_mut() {
297            if s.op == op {
298                s.cnt += 1;
299                return;
300            }
301        }
302        self.ops.push(CigarElem { op, cnt: 1 });
303    }
304
305    /// Pop a single [`CigarOp`] from the cigar.
306    pub fn pop_op(&mut self) -> Option<CigarOp> {
307        while let Some(elem) = self.ops.last_mut() {
308            let op = elem.op;
309            assert!(elem.cnt > 0);
310            elem.cnt -= 1;
311            if elem.cnt == 0 {
312                self.ops.pop();
313            }
314            return Some(op);
315        }
316        None
317    }
318
319    /// Push a [`CigarElem`] to the cigar.
320    pub fn push_elem(&mut self, e: CigarElem) {
321        if let Some(s) = self.ops.last_mut() {
322            if s.op == e.op {
323                s.cnt += e.cnt;
324                return;
325            }
326        }
327        self.ops.push(e);
328    }
329
330    /// Push `cnt` matches.
331    pub fn push_matches(&mut self, cnt: I) {
332        if let Some(s) = self.ops.last_mut() {
333            if s.op == CigarOp::Match {
334                s.cnt += cnt;
335                return;
336            }
337        }
338        self.ops.push(CigarElem {
339            op: CigarOp::Match,
340            cnt: cnt as _,
341        });
342    }
343
344    /// Check that the cigar is valid between `text` and `pattern` and return the cost.
345    pub fn verify(&self, cm: &CostModel, text: Seq, pattern: Seq) -> Result<Cost, &str> {
346        let mut pos = Pos(0, 0);
347        let mut cost: Cost = 0;
348
349        for &CigarElem { op, cnt } in &self.ops {
350            match op {
351                CigarOp::Match => {
352                    for _ in 0..cnt {
353                        if text.get(pos.0 as usize) != pattern.get(pos.1 as usize) {
354                            return Err("Expected match but found substitution.");
355                        }
356                        pos += op.delta();
357                    }
358                }
359                CigarOp::Sub => {
360                    for _ in 0..cnt {
361                        if text.get(pos.0 as usize) == pattern.get(pos.1 as usize) {
362                            return Err("Expected substitution but found match.");
363                        }
364                        pos += op.delta();
365                        cost += cm.sub;
366                    }
367                }
368                CigarOp::Ins => {
369                    cost += cm.open + cnt as Cost * cm.extend;
370                    pos += op.delta() * cnt;
371                }
372                CigarOp::Del => {
373                    cost += cm.open + cnt as Cost * cm.extend;
374                    pos += op.delta() * cnt;
375                }
376            }
377        }
378        if pos != Pos(text.len() as I, pattern.len() as I) {
379            return Err("Wrong alignment length.");
380        }
381
382        Ok(cost)
383    }
384
385    /// Splits all 'M'/[`CigarOp::Match`] into matches (`=`) and substitutions (`X`), and joins consecutive equal elements.
386    pub fn resolve_matches(ops: impl Iterator<Item = CigarElem>, text: Seq, pattern: Seq) -> Self {
387        let mut pos = Pos(0, 0);
388        let mut c = Cigar { ops: vec![] };
389        for CigarElem { op, cnt } in ops {
390            match op {
391                CigarOp::Match => {
392                    for _ in 0..cnt {
393                        c.push(if text[pos.0 as usize] == pattern[pos.1 as usize] {
394                            CigarOp::Match
395                        } else {
396                            CigarOp::Sub
397                        });
398                        pos += op.delta();
399                    }
400                    continue;
401                }
402                _ => {
403                    pos += op.delta() * cnt;
404                }
405            };
406            c.push_elem(CigarElem { op, cnt });
407        }
408        c
409    }
410
411    /// A simpler parsing function that only parses strings of characters `M=XID`, without preceding counts.
412    /// Consecutive characters are grouped, and `M` and `=` chars are resolved into `=` and `X`.
413    pub fn parse_without_counts(s: &str, text: Seq, pattern: Seq) -> Self {
414        Self::resolve_matches(
415            s.as_bytes().iter().map(|&op| CigarElem {
416                op: op.into(),
417                cnt: 1,
418            }),
419            text,
420            pattern,
421        )
422    }
423
424    /// A simpler parsing function that only parses strings of characters `MXID`, without preceding counts.
425    /// Consecutive characters are grouped. `M` chars are *not* resolved and assumed to mean `=`.
426    pub fn parse_without_resolving(s: &str) -> Self {
427        let mut c = Cigar { ops: vec![] };
428        for &op in s.as_bytes() {
429            c.push(op.into())
430        }
431        c
432    }
433
434    /// Parse a Cigar string with optional counts
435    pub fn from_string(s: &str) -> Self {
436        let mut c = Cigar { ops: vec![] };
437        for slice in s.as_bytes().split_inclusive(|b| !b.is_ascii_digit()) {
438            let (&op, cnt_bytes) = slice.split_last().expect("Cigar string cannot be empty");
439            let cnt = if cnt_bytes.is_empty() {
440                1
441            } else {
442                unsafe { std::str::from_utf8_unchecked(cnt_bytes) }
443                    .parse()
444                    .expect("Invalid Cigar count")
445            };
446            c.push_elem(CigarElem { op: op.into(), cnt });
447        }
448        c
449    }
450
451    /// A more generic (and slower) parsing function that also allows optional counts, e.g. `5M2X3M`.
452    /// Consecutive characters are grouped, and `M` and `=` chars are resolved into `=` and `X`.
453    pub fn parse(s: &str, text: Seq, pattern: Seq) -> Self {
454        Self::resolve_matches(
455            s.as_bytes()
456                .split_inclusive(|pattern| pattern.is_ascii_alphabetic())
457                .map(|pattern_slice| {
458                    let (&op, cnt) = pattern_slice.split_last().unwrap();
459                    let cnt = if cnt.is_empty() {
460                        1
461                    } else {
462                        unsafe { std::str::from_utf8_unchecked(cnt) }
463                            .parse()
464                            .unwrap()
465                    };
466                    CigarElem { op: op.into(), cnt }
467                }),
468            text,
469            pattern,
470        )
471    }
472
473    /// Clear the internal vector.
474    pub fn clear(&mut self) {
475        self.ops.clear();
476    }
477
478    /// Reverse the cigar string, for `rc(text)` and `rc(pattern)`.
479    pub fn reverse(&mut self) {
480        self.ops.reverse();
481    }
482}
483
484#[cfg(test)]
485mod tests {
486    use super::*;
487
488    #[test]
489    fn test_delta() {
490        for op in [CigarOp::Match, CigarOp::Del, CigarOp::Ins] {
491            assert_eq!(CigarOp::from_delta(op.delta()), op);
492        }
493        assert_eq!(CigarOp::from_delta(Pos(1, 1)), CigarOp::Match);
494        //  assert_eq!(CigarOp::from_delta(Pos(1, 1)), CigarOp::Sub);
495        assert_eq!(CigarOp::from_delta(Pos(1, 0)), CigarOp::Del); // Consume text
496        assert_eq!(CigarOp::from_delta(Pos(0, 1)), CigarOp::Ins); // Consume pattern
497    }
498
499    #[test]
500    fn test_valid_eq() {
501        let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2)]);
502        assert_eq!(c.to_string(), "1=1X");
503    }
504
505    #[test]
506    fn test_invalid_end_length() {
507        // (0,1)is invalid, text is longer than path
508        let c = Cigar::from_path(b"ab", b"aa", &vec![Pos(0, 0), Pos(0, 1)]);
509        assert!(c.verify(&CostModel::unit(), b"ab", b"aa").is_err());
510    }
511
512    #[test]
513    fn to_string() {
514        let c = Cigar {
515            ops: vec![
516                CigarElem {
517                    op: CigarOp::Ins,
518                    cnt: 1,
519                },
520                CigarElem {
521                    op: CigarOp::Match,
522                    cnt: 2,
523                },
524            ],
525        };
526        assert_eq!(c.to_string(), "1I2=");
527    }
528
529    #[test]
530    fn from_path() {
531        let c = Cigar::from_path(
532            b"aaa",
533            b"aabc",
534            &vec![Pos(0, 0), Pos(1, 1), Pos(2, 2), Pos(3, 3), Pos(3, 4)],
535        );
536        assert_eq!(c.to_string(), "2=1X1I");
537    }
538
539    #[test]
540    fn from_string_with_count() {
541        let c = Cigar::from_string("24=");
542        assert_eq!(c.ops.len(), 1);
543        assert_eq!(c.ops[0], CigarElem::new(CigarOp::Match, 24));
544    }
545
546    #[test]
547    fn from_string_mixed_ops() {
548        let c = Cigar::from_string("2=3I1X");
549        assert_eq!(
550            c.ops,
551            vec![
552                CigarElem::new(CigarOp::Match, 2),
553                CigarElem::new(CigarOp::Ins, 3),
554                CigarElem::new(CigarOp::Sub, 1)
555            ]
556        );
557    }
558
559    #[test]
560    fn from_string_no_counts() {
561        let c = Cigar::from_string("=XIDDD");
562        assert_eq!(
563            c.ops,
564            vec![
565                CigarElem::new(CigarOp::Match, 1),
566                CigarElem::new(CigarOp::Sub, 1),
567                CigarElem::new(CigarOp::Ins, 1),
568                CigarElem::new(CigarOp::Del, 3),
569            ]
570        );
571    }
572
573    #[test]
574    #[rustfmt::skip]
575    fn push_to_path() {
576        let mut c = Cigar::default();
577                                // 0 0
578        c.push(CigarOp::Match); // 1  1
579        c.push(CigarOp::Del);   // 2  1  (text +1)
580        c.push(CigarOp::Ins);   // 2  2  (pattern +1)
581        c.push(CigarOp::Sub);   // 3  3
582
583        assert_eq!(
584            c.to_path(),
585            [
586                Pos(0, 0),
587                Pos(1, 1),
588                Pos(2, 1),
589                Pos(2, 2),
590                Pos(3, 3),
591            ]
592        );
593    }
594
595    #[test]
596    fn to_char_pairs_all_match() {
597        let c = Cigar::from_string("3=");
598        let pairs = c.to_char_pairs(b"aaa", b"aaa");
599        assert_eq!(
600            pairs,
601            vec![
602                CigarOpChars::Match(b'a'),
603                CigarOpChars::Match(b'a'),
604                CigarOpChars::Match(b'a'),
605            ]
606        );
607    }
608
609    #[test]
610    fn to_char_pairs_sub() {
611        let c = Cigar::from_string("1X");
612        let pairs = c.to_char_pairs(b"a", b"c");
613        assert_eq!(pairs, vec![CigarOpChars::Sub(b'a', b'c')]);
614    }
615
616    #[test]
617    fn to_char_pairs_ins() {
618        let c = Cigar::from_string("1=1I1=");
619        let pairs = c.to_char_pairs(b"ac", b"abc");
620        assert_eq!(
621            pairs,
622            vec![
623                CigarOpChars::Match(b'a'),
624                CigarOpChars::Ins(b'b'),
625                CigarOpChars::Match(b'c'),
626            ]
627        );
628    }
629
630    #[test]
631    fn to_char_pairs_del() {
632        let c = Cigar::from_string("1=1D1=");
633        let pairs = c.to_char_pairs(b"abc", b"ac");
634        assert_eq!(
635            pairs,
636            vec![
637                CigarOpChars::Match(b'a'),
638                CigarOpChars::Del(b'b'),
639                CigarOpChars::Match(b'c'),
640            ]
641        );
642    }
643
644    #[test]
645    fn to_char_pairs_mixed() {
646        let c = Cigar::from_string("2=1X1I");
647        let pairs = c.to_char_pairs(b"abZd", b"abYc");
648        assert_eq!(
649            pairs,
650            vec![
651                CigarOpChars::Match(b'a'),
652                CigarOpChars::Match(b'b'),
653                CigarOpChars::Sub(b'Z', b'Y'),
654                CigarOpChars::Ins(b'c'),
655            ]
656        );
657    }
658}