Skip to main content

gbz_base/alignment/
mapping.rs

1//! Basic building blocks of an alignment.
2
3use crate::utils;
4
5use std::ops::Range;
6
7//-----------------------------------------------------------------------------
8
9/// An alignment between a substring of a query sequence and a single node using a single edit operation.
10///
11/// This can also represent unaligned insertions.
12#[derive(Clone, Debug, PartialEq, Eq)]
13pub struct Mapping {
14    // Aligned interval of the query sequence.
15    seq_interval: Range<usize>,
16    // Handle of the target node, or `gbz::ENDMARKER` for no target node.
17    handle: usize,
18    // Length of the target node.
19    node_len: usize,
20    // Aligned interval of the target node.
21    node_interval: Range<usize>,
22    // Edit operation.
23    edit: Difference,
24}
25
26impl Mapping {
27    /// Creates a new mapping starting at the given position.
28    ///
29    /// # Panics
30    ///
31    /// Will panic if the edit extends past the end of the node.
32    /// Will panic if `handle` is [`gbz::ENDMARKER`].
33    pub fn new(seq_offset: usize, handle: usize, node_len: usize, node_offset: usize, edit: Difference) -> Self {
34        assert!(node_offset + edit.target_len() <= node_len, "The edit extends past the end of the node");
35        assert!(handle != gbz::ENDMARKER, "A mapping cannot have {} as the target handle", gbz::ENDMARKER);
36        Self {
37            seq_interval: seq_offset..seq_offset + edit.query_len(),
38            handle,
39            node_len,
40            node_interval: node_offset..node_offset + edit.target_len(),
41            edit,
42        }
43    }
44
45    /// Creates a mapping for an unaligned insertion.
46    ///
47    /// # Panics
48    ///
49    /// Will panic if the edit is not an insertion.
50    pub fn unaligned(seq_offset: usize, edit: Difference) -> Self {
51        assert!(matches!(edit, Difference::Insertion(_)), "An unaligned mapping must have an insertion edit");
52        Self {
53            seq_interval: seq_offset..seq_offset + edit.query_len(),
54            handle: gbz::ENDMARKER,
55            node_len: 0,
56            node_interval: 0..0,
57            edit,
58        }
59    }
60
61    /// Returns the aligned interval of the query sequence.
62    #[inline]
63    pub fn seq_interval(&self) -> &Range<usize> {
64        &self.seq_interval
65    }
66
67    /// Returns the length of the query interval.
68    #[inline]
69    pub fn query_len(&self) -> usize {
70        self.seq_interval.len()
71    }
72
73    /// Returns the handle of the target node.
74    ///
75    /// Returns [`gbz::ENDMARKER`] for an unaligned insertion.
76    #[inline]
77    pub fn handle(&self) -> usize {
78        self.handle
79    }
80
81    /// Returns `true` if this mapping is an unaligned insertion.
82    #[inline]
83    pub fn is_unaligned(&self) -> bool {
84        self.handle == gbz::ENDMARKER
85    }
86
87    /// Returns the length of the target node.
88    #[inline]
89    pub fn node_len(&self) -> usize {
90        self.node_len
91    }
92
93    /// Returns the aligned interval of the target node.
94    #[inline]
95    pub fn node_interval(&self) -> &Range<usize> {
96        &self.node_interval
97    }
98
99    /// Returns the length of the target interval.
100    #[inline]
101    pub fn target_len(&self) -> usize {
102        self.node_interval.len()
103    }
104
105    /// Returns the edit operation.
106    #[inline]
107    pub fn edit(&self) -> &Difference {
108        &self.edit
109    }
110
111    /// Returns `true` this mapping follows the given one in the same node.
112    #[inline]
113    pub fn follows(&self, other: &Self) -> bool {
114        self.handle == other.handle && self.node_interval.start == other.node_interval.end
115    }
116
117    /// Returns `true` if this mapping is at the start of the target node.
118    #[inline]
119    pub fn is_at_start(&self) -> bool {
120        self.node_interval.start == 0
121    }
122
123    /// Returns `true` if this mapping is at the end of the target node.
124    #[inline]
125    pub fn is_at_end(&self) -> bool {
126        self.node_interval.end == self.node_len
127    }
128}
129
130//-----------------------------------------------------------------------------
131
132/// An operation in a difference string describing an alignment between a query sequence and a target sequence.
133///
134/// This implementation supports the following operations:
135///
136/// * `=`: A match given as the matching sequence.
137/// * `:`: A match given as the match length.
138/// * `*`: A mismatch given as the target base and the query base.
139/// * `+`: An insertion given as the inserted sequence.
140/// * `-`: A deletion given as the deleted sequence.
141///
142/// The operations do not store target bases, as the query sequence can be reconstructed without that information.
143/// Operation `~` (intron length and splice signal) is not supported yet.
144/// Parsing is based on bytes rather than characters to avoid unnecessary UTF-8 validation.
145///
146/// # Examples
147///
148/// ```
149/// use gbz_base::Difference;
150///
151/// let with_gaps = b":48-CAT:44+GATTACA:51";
152/// let ops = Difference::parse(with_gaps);
153/// assert!(ops.is_ok());
154/// let ops = ops.unwrap();
155/// assert_eq!(ops.len(), 5);
156/// assert_eq!(ops[0], Difference::Match(48));
157/// assert_eq!(ops[1], Difference::Deletion(3));
158/// assert_eq!(ops[2], Difference::Match(44));
159/// assert_eq!(ops[3], Difference::Insertion(b"GATTACA".to_vec()));
160/// assert_eq!(ops[4], Difference::Match(51));
161/// ```
162#[derive(Clone, Debug, PartialEq, Eq)]
163pub enum Difference {
164    /// A match of the given length.
165    Match(usize),
166    /// Mismatch represented as the query base.
167    Mismatch(u8),
168    /// Insertion to the reference represented as the inserted sequence.
169    Insertion(Vec<u8>),
170    /// Deletion from the reference represented as deletion length.
171    Deletion(usize),
172    /// Marker for the end of a sequence when concatenating multiple difference strings.
173    End,
174}
175
176impl Difference {
177    /// Number of supported operation types.
178    pub const NUM_TYPES: usize = 5;
179
180    /// Symbol used as a substitute for an unknown base.
181    pub const UNKNOWN_BASE: u8 = b'X';
182
183    // TODO: This does not support `~` (intron length and splice signal) yet.
184    const OPS: &'static [u8] = b"=:*+-";
185
186    fn base_to_upper(c: u8) -> u8 {
187        if c.is_ascii_lowercase() {
188            c - b'a' + b'A'
189        } else {
190            c
191        }
192    }
193
194    fn seq_to_upper(seq: &[u8]) -> Vec<u8> {
195        seq.iter().map(|&c| Self::base_to_upper(c)).collect()
196    }
197
198    fn matching_sequence(value: &[u8]) -> Option<Self> {
199        Some(Self::Match(value.len()))
200    }
201
202    fn match_length(value: &[u8]) -> Option<Self> {
203        let len = str::from_utf8(value).ok()?;
204        let len = len.parse::<usize>().ok()?;
205        Some(Self::Match(len))
206    }
207
208    fn mismatch(value: &[u8]) -> Option<Self> {
209        if value.len() != 2 {
210            return None;
211        }
212        Some(Self::Mismatch(Self::base_to_upper(value[1])))
213    }
214
215    fn insertion(value: &[u8]) -> Option<Self> {
216        Some(Self::Insertion(Self::seq_to_upper(value)))
217    }
218
219    fn deletion(value: &[u8]) -> Option<Self> {
220        Some(Self::Deletion(value.len()))
221    }
222
223    /// Parses a difference string and returns it as a vector of operations.
224    ///
225    /// Returns an error if the difference string is invalid.
226    pub fn parse(difference_string: &[u8]) -> Result<Vec<Self>, String> {
227        let mut result: Vec<Self> = Vec::new();
228        if difference_string.is_empty() {
229            return Ok(result);
230        }
231        if !Self::OPS.contains(&difference_string[0]) {
232            return Err(format!("Invalid difference string operation: {}", difference_string[0] as char));
233        }
234
235        let mut start = 0;
236        while start < difference_string.len() {
237            let mut end = start + 1;
238            while end < difference_string.len() && !Self::OPS.contains(&difference_string[end]) {
239                end += 1;
240            }
241            let value = &difference_string[start + 1..end];
242            let op = match difference_string[start] {
243                b'=' => Self::matching_sequence(value),
244                b':' => Self::match_length(value),
245                b'*' => Self::mismatch(value),
246                b'+' => Self::insertion(value),
247                b'-' => Self::deletion(value),
248                _ => return Err(format!("Invalid difference string operation: {}", difference_string[start] as char)),
249            }.ok_or_else(|| format!("Invalid difference string field: {}", String::from_utf8_lossy(&difference_string[start..end])))?;
250            result.push(op);
251            start = end;
252        }
253
254        Ok(result)
255    }
256
257    /// Parses a difference string and returns it as a normalized vector of operations.
258    ///
259    /// The operations are merged and empty operations are removed.
260    /// Returns an error if the difference string is invalid.
261    pub fn parse_normalized(difference_string: &[u8]) -> Result<Vec<Self>, String> {
262        let ops = Self::parse(difference_string)?;
263        Ok(Self::normalize(ops))
264    }
265
266    /// Calculates various statistics from a sequence of operations.
267    ///
268    /// The return value is (query length, target length, matches, edits).
269    pub fn stats(difference_string: &[Self]) -> (usize, usize, usize, usize) {
270        let mut query_len = 0;
271        let mut target_len = 0;
272        let mut matches = 0;
273        let mut edits = 0;
274        for diff in difference_string.iter() {
275            match diff {
276                Self::Match(len) => {
277                    query_len += len;
278                    target_len += len;
279                    matches += len;
280                },
281                Self::Mismatch(_) => {
282                    query_len += 1;
283                    target_len += 1;
284                    edits += 1;
285                }
286                Self::Insertion(seq) => {
287                    query_len += seq.len();
288                    edits += seq.len();
289                }
290                Self::Deletion(len) => {
291                    target_len += len;
292                    edits += len;
293                },
294                Self::End => {},
295            }
296        }
297        (query_len, target_len, matches, edits)
298    }
299
300    /// Returns the length of the operation.
301    pub fn len(&self) -> usize {
302        match self {
303            Self::Match(len) => *len,
304            Self::Mismatch(_) => 1,
305            Self::Insertion(seq) => seq.len(),
306            Self::Deletion(len) => *len,
307            Self::End => 0,
308        }
309    }
310
311    /// Returns `true` if the operation is empty.
312    pub fn is_empty(&self) -> bool {
313        self.len() == 0
314    }
315
316    /// Returns the length of the operation in the target sequence.
317    pub fn target_len(&self) -> usize {
318        match self {
319            Self::Match(len) => *len,
320            Self::Mismatch(_) => 1,
321            Self::Insertion(_) => 0,
322            Self::Deletion(len) => *len,
323            Self::End => 0,
324        }
325    }
326
327    /// Returns the length of the operation in the query sequence.
328    pub fn query_len(&self) -> usize {
329        match self {
330            Self::Match(len) => *len,
331            Self::Mismatch(_) => 1,
332            Self::Insertion(seq) => seq.len(),
333            Self::Deletion(_) => 0,
334            Self::End => 0,
335        }
336    }
337
338    /// Merges the given operation into this operation if they can be merged.
339    ///
340    /// Returns `true` if the operations were merged.
341    pub fn try_merge(&mut self, op: &Self) -> bool {
342        match (self, op) {
343            (Self::Match(len1), Self::Match(len2)) => {
344                *len1 += len2;
345                true
346            },
347            (Self::Insertion(seq1), Self::Insertion(seq2)) => {
348                seq1.extend_from_slice(seq2);
349                true
350            },
351            (Self::Deletion(len1), Self::Deletion(len2)) => {
352                *len1 += len2;
353                true
354            },
355            _ => false,
356        }
357    }
358
359    /// Normalizes the sequence of operations.
360    ///
361    /// This merges adjacent matches and insertions and removes empty operations.
362    pub fn normalize(ops: Vec<Self>) -> Vec<Self> {
363        let mut result = ops;
364        let mut tail = 0;
365        for i in 0..result.len() {
366            if result[i].is_empty() {
367                continue;
368            }
369            if tail > 0 {
370                // We need to be careful around the borrow checker.
371                let (left, right) = result.split_at_mut(i);
372                if left[tail - 1].try_merge(&right[0]) {
373                    continue;
374                }
375            }
376            result.swap(tail, i);
377            tail += 1;
378        }
379
380        result.truncate(tail);
381        result
382    }
383
384    /// Writes a difference string as a `Vec<u8>` string.
385    pub fn to_bytes(ops: &[Difference], target_sequence: &[u8]) -> Vec<u8> {
386        let mut result = Vec::new();
387        let mut target_offset = 0;
388        for op in ops.iter() {
389            match op {
390                Self::Match(len) => {
391                    result.push(b':');
392                    utils::append_usize(&mut result, *len);
393                    target_offset += *len;
394                },
395                Self::Mismatch(base) => {
396                    result.push(b'*');
397                    result.push(target_sequence[target_offset]);
398                    result.push(*base);
399                    target_offset += 1;
400                },
401                Self::Insertion(seq) => {
402                    result.push(b'+');
403                    result.extend_from_slice(seq);
404                },
405                Self::Deletion(len) => {
406                    result.push(b'-');
407                    result.extend_from_slice(&target_sequence[target_offset..target_offset + *len]);
408                    target_offset += *len;
409                },
410                Self::End => {},
411            }
412        }
413        result
414    }
415}
416
417//-----------------------------------------------------------------------------
418
419#[cfg(test)]
420mod tests {
421
422use super::*;
423
424// Tests for `Difference`.
425
426fn check_difference(seq: &[u8], truth: &[Difference], name: &str, normalize: bool) {
427    let result = Difference::parse(seq);
428    assert!(result.is_ok(), "Failed to parse the difference string for {}: {}", name, result.err().unwrap());
429    let mut result = result.unwrap();
430    if normalize {
431        result = Difference::normalize(result);
432    }
433    assert_eq!(result.len(), truth.len(), "Wrong number of operations for {}", name);
434    for i in 0..truth.len() {
435        assert_eq!(result[i], truth[i], "Wrong operation at position {} for {}", i, name);
436    }
437}
438
439fn check_to_bytes(ops: &[Difference], truth: &[u8], target_sequence: &[u8], name: &str) {
440    let result = Difference::to_bytes(ops, target_sequence);
441    assert_eq!(result, truth, "Wrong byte representation for {}", name);
442}
443
444#[test]
445fn difference_empty() {
446    let name = "empty";
447    let seq = b"";
448    let truth = [];
449    check_difference(seq, &truth, name, false);
450    let target_sequence = b"";
451    check_to_bytes(&truth, seq, target_sequence, name);
452}
453
454#[test]
455fn difference_single() {
456    {
457        let name = "matching sequence";
458        let seq = b"=ACGT";
459        let truth = [ Difference::Match(4) ];
460        check_difference(seq, &truth, name, false);
461        let expected = b":4"; // Converted to match length.
462        let target_sequence = b"ACGT";
463        check_to_bytes(&truth, expected, target_sequence, name);
464    }
465    {
466        let name = "match length";
467        let seq = b":5";
468        let truth = [ Difference::Match(5) ];
469        check_difference(seq, &truth, name, false);
470        let target_sequence = b"GATTA";
471        check_to_bytes(&truth, seq, target_sequence, name);
472    }
473    {
474        let name = "mismatch";
475        let seq = b"*AC";
476        let truth = [ Difference::Mismatch(b'C') ];
477        check_difference(seq, &truth, name, false);
478        let target_sequence = b"A";
479        check_to_bytes(&truth, seq, target_sequence, name);
480    }
481    {
482        let name = "insertion";
483        let seq = b"+ACGT";
484        let truth = [ Difference::Insertion(b"ACGT".to_vec()) ];
485        check_difference(seq, &truth, name, false);
486        let target_sequence = b"";
487        check_to_bytes(&truth, seq, target_sequence, name);
488    }
489    {
490        let name = "deletion";
491        let seq = b"-ACGT";
492        let truth = [ Difference::Deletion(4) ];
493        check_difference(seq, &truth, name, false);
494        let target_sequence = b"ACGT";
495        check_to_bytes(&truth, seq, target_sequence, name);
496    }
497}
498
499#[test]
500fn difference_multi() {
501    // All of these are from real reads mapped with Giraffe.
502    // Target sequences only contain the relevant bases.
503    {
504        let name = "ERR3239454.129477191 fragment 2";
505        let seq = b":24*CG:78-C:35*TG:2*TG:1*TG:6";
506        let truth = [
507            Difference::Match(24),
508            Difference::Mismatch(b'G'),
509            Difference::Match(78),
510            Difference::Deletion(1),
511            Difference::Match(35),
512            Difference::Mismatch(b'G'),
513            Difference::Match(2),
514            Difference::Mismatch(b'G'),
515            Difference::Match(1),
516            Difference::Mismatch(b'G'),
517            Difference::Match(6)
518        ];
519        check_difference(seq, &truth, name, false);
520        let mut target_sequence = vec![b'-'; 151];
521        target_sequence[24] = b'C';
522        target_sequence[103] = b'C';
523        target_sequence[139] = b'T';
524        target_sequence[142] = b'T';
525        target_sequence[144] = b'T';
526        check_to_bytes(&truth, seq, &target_sequence, name);
527    }
528    {
529        let name = "ERR3239454.11251898 fragment 1";
530        let seq = b":129+TTGAGGGGGTATAAGAGGTCG";
531        let truth = [
532            Difference::Match(129),
533            Difference::Insertion(b"TTGAGGGGGTATAAGAGGTCG".to_vec())
534        ];
535        check_difference(seq, &truth, name, false);
536        let target_sequence = vec![b'-', 129];
537        check_to_bytes(&truth, seq, &target_sequence, name);
538    }
539    {
540        let name = "ERR3239454.97848632 fragment 1";
541        let seq = b":111*GT:20*CA:6*GT:2*GT:3*GT:3";
542        let truth = [
543            Difference::Match(111),
544            Difference::Mismatch(b'T'),
545            Difference::Match(20),
546            Difference::Mismatch(b'A'),
547            Difference::Match(6),
548            Difference::Mismatch(b'T'),
549            Difference::Match(2),
550            Difference::Mismatch(b'T'),
551            Difference::Match(3),
552            Difference::Mismatch(b'T'),
553            Difference::Match(3)
554        ];
555        check_difference(seq, &truth, name, false);
556        let mut target_sequence = vec![b'-'; 150];
557        target_sequence[111] = b'G';
558        target_sequence[132] = b'C';
559        target_sequence[139] = b'G';
560        target_sequence[142] = b'G';
561        target_sequence[146] = b'G';
562        check_to_bytes(&truth, seq, &target_sequence, name);
563    }
564}
565
566#[test]
567fn difference_case() {
568    {
569        // Lower case mismatch.
570        let seq = b"*ac";
571        let truth = [ Difference::Mismatch(b'C') ];
572        check_difference(seq, &truth, "lower case mismatch", false);
573    }
574    {
575        // Lower case insertion.
576        let seq = b"+acgt";
577        let truth = [ Difference::Insertion(b"ACGT".to_vec()) ];
578        check_difference(seq, &truth, "lower case insertion", false);
579    }
580}
581
582fn invalid_difference(seq: &[u8], name: &str) {
583    let result = Difference::parse(seq);
584    assert!(result.is_err(), "Parsed an invalid difference string: {}", name);
585}
586
587#[test]
588fn difference_invalid() {
589    invalid_difference(b"ABC", "invalid operation at start");
590    invalid_difference(b"+AC:", "empty operation at end");
591    invalid_difference(b"~AC30AC", "intron / splice operation");
592    invalid_difference(b":123A", "match length is not an integer");
593    invalid_difference(b"+GATTACA:", "empty match length");
594    invalid_difference(b"*ACT", "mismatch is too long");
595    invalid_difference(b"*A", "mismatch is too short");
596}
597
598#[test]
599fn difference_len() {
600    {
601        let diff = Difference::Match(42);
602        assert_eq!(diff.len(), 42, "Wrong length for a match");
603    }
604    {
605        let diff = Difference::Mismatch(b'A');
606        assert_eq!(diff.len(), 1, "Wrong length for a mismatch");
607    }
608    {
609        let diff = Difference::Insertion(b"GATTACA".to_vec());
610        assert_eq!(diff.len(), 7, "Wrong length for an insertion");
611    }
612    {
613        let diff = Difference::Deletion(42);
614        assert_eq!(diff.len(), 42, "Wrong length for a deletion");
615    }
616}
617
618#[test]
619fn difference_normalize() {
620    {
621        let seq = b"=ACGT:4*AC*GT:2:3=ACT*AC-ACGT-ACGT+GATTACA+CAT";
622        let truth = [
623            Difference::Match(8),
624            Difference::Mismatch(b'C'),
625            Difference::Mismatch(b'T'),
626            Difference::Match(8),
627            Difference::Mismatch(b'C'),
628            Difference::Deletion(8),
629            Difference::Insertion(b"GATTACACAT".to_vec()),
630        ];
631        check_difference(seq, &truth, "normalized", true);
632    }
633    {
634        let seq = b"=:0+-*AC:30=GATTA+-:0=";
635        let truth = [
636            Difference::Mismatch(b'C'),
637            Difference::Match(35),
638        ];
639        check_difference(seq, &truth, "normalized with empty operations", true);
640    }
641}
642
643}
644
645//-----------------------------------------------------------------------------