Skip to main content

snp_index/read/
refine.rs

1//! Genome-aware read refinement.
2//!
3//! Refinement must be sequence-aware. A mismatch near a splice junction may be
4//! either a true mutation or a mapping artifact. We only rewrite it when the
5//! read base matches the reference base at the proposed corrected position.
6
7use crate::genome::Genome;
8use crate::read::aligned_read::{AlignedRead, ReadOpKind};
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub struct RefineOptions {
12    pub max_diff_before_refskip: u32,
13    pub max_diff_after_refskip: u32,
14    pub merge_adjacent_ops: bool,
15}
16
17impl Default for RefineOptions {
18    fn default() -> Self {
19        Self {
20            max_diff_before_refskip: 1,
21            max_diff_after_refskip: 1,
22            merge_adjacent_ops: true,
23        }
24    }
25}
26
27impl AlignedRead {
28    /// Refine this read against a reference genome.
29    ///
30    /// This is the safe production refinement entry point.
31    pub fn refine_against_genome(&mut self, genome: &Genome, options: RefineOptions) {
32        let pairs = self.op_pairs();
33
34        let pairs = self.absorb_diff_before_refskip_against_genome(&pairs, genome, options);
35        let pairs = self.absorb_diff_after_refskip_against_genome(&pairs, genome, options);
36
37        let pairs = if options.merge_adjacent_ops {
38            Self::merge_adjacent_op_pairs(&pairs)
39        } else {
40            pairs
41        };
42
43        self.replace_ops(pairs);
44    }
45
46    /// Convenience wrapper using default options.
47    pub fn refine_against_genome_default(&mut self, genome: &Genome) {
48        self.refine_against_genome(genome, RefineOptions::default());
49    }
50
51    /// Absorb `M X N M` into `M N M` only if the X base matches the first
52    /// reference base after the skipped region.
53    pub fn absorb_diff_before_refskip_against_genome(
54        &self,
55        pairs: &[(ReadOpKind, u32)],
56        genome: &Genome,
57        options: RefineOptions,
58    ) -> Vec<(ReadOpKind, u32)> {
59        if options.max_diff_before_refskip == 0 || pairs.len() < 4 {
60            return pairs.to_vec();
61        }
62
63        let ops = Self::build_positioned_ops(self.ref_start0, pairs);
64        let mut out = Vec::with_capacity(pairs.len());
65        let mut i = 0usize;
66
67        while i < ops.len() {
68            if i + 3 < ops.len()
69                && Self::is_aligned_kind(ops[i].kind)
70                && ops[i + 1].kind == ReadOpKind::Diff
71                && ops[i + 1].len <= options.max_diff_before_refskip
72                && ops[i + 2].kind == ReadOpKind::RefSkip
73                && Self::is_aligned_kind(ops[i + 3].kind)
74                && self.diff_matches_reference_at_target(genome, &ops[i + 1], ops[i + 3].ref_start0)
75            {
76                out.push((ops[i].kind, ops[i].len));
77                out.push((ops[i + 2].kind, ops[i + 2].len));
78                out.push((ops[i + 3].kind, ops[i + 3].len + ops[i + 1].len));
79                i += 4;
80            } else {
81                out.push((ops[i].kind, ops[i].len));
82                i += 1;
83            }
84        }
85
86        out
87    }
88
89    /// Absorb `M N X M` into `M N M` only if the X base matches the last
90    /// reference base before the skipped region.
91    pub fn absorb_diff_after_refskip_against_genome(
92        &self,
93        pairs: &[(ReadOpKind, u32)],
94        genome: &Genome,
95        options: RefineOptions,
96    ) -> Vec<(ReadOpKind, u32)> {
97        if options.max_diff_after_refskip == 0 || pairs.len() < 4 {
98            return pairs.to_vec();
99        }
100
101        let ops = Self::build_positioned_ops(self.ref_start0, pairs);
102        let mut out = Vec::with_capacity(pairs.len());
103        let mut i = 0usize;
104
105        while i < ops.len() {
106            if i + 3 < ops.len()
107                && Self::is_aligned_kind(ops[i].kind)
108                && ops[i + 1].kind == ReadOpKind::RefSkip
109                && ops[i + 2].kind == ReadOpKind::Diff
110                && ops[i + 2].len <= options.max_diff_after_refskip
111                && Self::is_aligned_kind(ops[i + 3].kind)
112                && self.diff_matches_reference_at_target(genome, &ops[i + 2], ops[i].ref_end0())
113            {
114                out.push((ops[i].kind, ops[i].len + ops[i + 2].len));
115                out.push((ops[i + 1].kind, ops[i + 1].len));
116                out.push((ops[i + 3].kind, ops[i + 3].len));
117                i += 4;
118            } else {
119                out.push((ops[i].kind, ops[i].len));
120                i += 1;
121            }
122        }
123
124        out
125    }
126
127    /// Check whether every read base in a Diff operation matches the reference
128    /// at the proposed corrected target position.
129    pub fn diff_matches_reference_at_target(
130        &self,
131        genome: &Genome,
132        diff_op: &crate::read::aligned_read::ReadOp,
133        target_ref_start0: u32,
134    ) -> bool {
135        for offset in 0..diff_op.len {
136            let read_pos = diff_op.read_start + offset;
137            let target_pos = target_ref_start0 + offset;
138
139            let Some(read_base) = self.seq.get(read_pos as usize).copied() else {
140                return false;
141            };
142
143            if !genome.base_matches(self.chr_id, target_pos, read_base) {
144                return false;
145            }
146        }
147
148        true
149    }
150
151    pub fn merge_adjacent_op_pairs(pairs: &[(ReadOpKind, u32)]) -> Vec<(ReadOpKind, u32)> {
152        let mut out: Vec<(ReadOpKind, u32)> = Vec::with_capacity(pairs.len());
153
154        for pair in pairs {
155            if pair.1 == 0 {
156                continue;
157            }
158
159            match out.last_mut() {
160                Some(last) if last.0 == pair.0 => last.1 += pair.1,
161                _ => out.push(*pair),
162            }
163        }
164
165        out
166    }
167
168    pub fn is_aligned_kind(kind: ReadOpKind) -> bool {
169        matches!(
170            kind,
171            ReadOpKind::Match | ReadOpKind::Equal | ReadOpKind::Diff
172        )
173    }
174}
175
176#[cfg(test)]
177mod tests {
178    use super::*;
179    use crate::read::aligned_read::Strand;
180
181    impl Genome {
182        fn synthetic_splice_genome() -> Self {
183            // Coordinates:
184            // 100..110 = first exon, C
185            // 110..210 = intron, A
186            // 210..226 = second exon, G
187            let mut seq = vec![b'N'; 300];
188
189            for b in &mut seq[100..110] {
190                *b = b'C';
191            }
192            for b in &mut seq[110..210] {
193                *b = b'A';
194            }
195            for b in &mut seq[210..226] {
196                *b = b'G';
197            }
198
199            Genome::new(vec![("chr1".to_string(), seq)]).unwrap()
200        }
201    }
202
203    impl AlignedRead {
204        fn make_read(seq: &[u8], pairs: Vec<(ReadOpKind, u32)>) -> Self {
205            Self::new(
206                0,
207                Strand::Plus,
208                100,
209                seq.to_vec(),
210                Some(vec![30; seq.len()]),
211                pairs,
212            )
213        }
214    }
215
216    #[test]
217    fn real_internal_mutation_is_not_refined_without_refskip() {
218        let genome = Genome::synthetic_splice_genome();
219
220        let mut read = AlignedRead::make_read(
221            b"CCCCCCCCCCTCCCC",
222            vec![
223                (ReadOpKind::Match, 10),
224                (ReadOpKind::Diff, 1),
225                (ReadOpKind::Match, 4),
226            ],
227        );
228
229        read.refine_against_genome(&genome, RefineOptions::default());
230
231        assert_eq!(
232            read.op_pairs(),
233            vec![
234                (ReadOpKind::Match, 10),
235                (ReadOpKind::Diff, 1),
236                (ReadOpKind::Match, 4),
237            ]
238        );
239
240        let obs = read.base_at_ref_pos(110).unwrap();
241        assert_eq!(obs.base, b'T');
242        assert_eq!(obs.read_pos, 10);
243    }
244
245    #[test]
246    fn diff_before_refskip_is_preserved_if_it_is_real_mutation_not_second_exon_base() {
247        let genome = Genome::synthetic_splice_genome();
248
249        let mut read = AlignedRead::make_read(
250            b"CCCCCCCCCCTGGGGGGGGGGGGGGG",
251            vec![
252                (ReadOpKind::Match, 10),
253                (ReadOpKind::Diff, 1),
254                (ReadOpKind::RefSkip, 100),
255                (ReadOpKind::Match, 15),
256            ],
257        );
258
259        read.refine_against_genome(&genome, RefineOptions::default());
260
261        assert_eq!(
262            read.op_pairs(),
263            vec![
264                (ReadOpKind::Match, 10),
265                (ReadOpKind::Diff, 1),
266                (ReadOpKind::RefSkip, 100),
267                (ReadOpKind::Match, 15),
268            ]
269        );
270
271        let mutation = read.base_at_ref_pos(110).unwrap();
272        assert_eq!(mutation.base, b'T');
273        assert_eq!(mutation.read_pos, 10);
274    }
275
276    #[test]
277    fn diff_after_refskip_is_preserved_if_it_is_not_left_exon_reference() {
278        let genome = Genome::synthetic_splice_genome();
279
280        let mut read = AlignedRead::make_read(
281            b"CCCCCCCCCCTGGGGGGGGGGGGGGG",
282            vec![
283                (ReadOpKind::Match, 10),
284                (ReadOpKind::RefSkip, 100),
285                (ReadOpKind::Diff, 1),
286                (ReadOpKind::Match, 15),
287            ],
288        );
289
290        read.refine_against_genome(&genome, RefineOptions::default());
291
292        assert_eq!(
293            read.op_pairs(),
294            vec![
295                (ReadOpKind::Match, 10),
296                (ReadOpKind::RefSkip, 100),
297                (ReadOpKind::Diff, 1),
298                (ReadOpKind::Match, 15),
299            ]
300        );
301
302        let mutation = read.base_at_ref_pos(210).unwrap();
303        assert_eq!(mutation.base, b'T');
304        assert_eq!(mutation.read_pos, 10);
305    }
306
307    #[test]
308    fn larger_diff_is_not_refined_by_default() {
309        let genome = Genome::synthetic_splice_genome();
310
311        let mut read = AlignedRead::make_read(
312            b"CCCCCCCCCCGGGGGGGGGGGGGGGGG",
313            vec![
314                (ReadOpKind::Match, 10),
315                (ReadOpKind::Diff, 2),
316                (ReadOpKind::RefSkip, 100),
317                (ReadOpKind::Match, 15),
318            ],
319        );
320
321        read.refine_against_genome(&genome, RefineOptions::default());
322
323        assert_eq!(
324            read.op_pairs(),
325            vec![
326                (ReadOpKind::Match, 10),
327                (ReadOpKind::Diff, 2),
328                (ReadOpKind::RefSkip, 100),
329                (ReadOpKind::Match, 15),
330            ]
331        );
332    }
333
334    #[test]
335    fn larger_diff_can_be_refined_when_configured_and_reference_matches() {
336        let genome = Genome::synthetic_splice_genome();
337
338        let mut read = AlignedRead::make_read(
339            b"CCCCCCCCCCGGGGGGGGGGGGGGGGG",
340            vec![
341                (ReadOpKind::Match, 10),
342                (ReadOpKind::Diff, 2),
343                (ReadOpKind::RefSkip, 100),
344                (ReadOpKind::Match, 15),
345            ],
346        );
347
348        read.refine_against_genome(
349            &genome,
350            RefineOptions {
351                max_diff_before_refskip: 2,
352                max_diff_after_refskip: 1,
353                merge_adjacent_ops: true,
354            },
355        );
356
357        assert_eq!(
358            read.op_pairs(),
359            vec![
360                (ReadOpKind::Match, 10),
361                (ReadOpKind::RefSkip, 100),
362                (ReadOpKind::Match, 17),
363            ]
364        );
365
366        assert_eq!(read.base_at_ref_pos(210).unwrap().base, b'G');
367        assert_eq!(read.base_at_ref_pos(211).unwrap().base, b'G');
368    }
369
370    #[test]
371    fn merge_adjacent_op_pairs_collapses_neighbors_and_removes_zero_length_ops() {
372        let pairs = vec![
373            (ReadOpKind::Match, 5),
374            (ReadOpKind::Match, 3),
375            (ReadOpKind::Diff, 0),
376            (ReadOpKind::RefSkip, 100),
377            (ReadOpKind::Match, 2),
378            (ReadOpKind::Match, 4),
379        ];
380
381        let merged = AlignedRead::merge_adjacent_op_pairs(&pairs);
382
383        assert_eq!(
384            merged,
385            vec![
386                (ReadOpKind::Match, 8),
387                (ReadOpKind::RefSkip, 100),
388                (ReadOpKind::Match, 6),
389            ]
390        );
391    }
392}