1use crate::genome::Genome;
8use crate::read::aligned_read::{AlignedRead, ReadOpKind};
9
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub struct RefineOptions {
12 pub max_diff_before_refskip: u32,
13 pub max_diff_after_refskip: u32,
14 pub merge_adjacent_ops: bool,
15}
16
17impl Default for RefineOptions {
18 fn default() -> Self {
19 Self {
20 max_diff_before_refskip: 1,
21 max_diff_after_refskip: 1,
22 merge_adjacent_ops: true,
23 }
24 }
25}
26
27impl AlignedRead {
28 pub fn refine_against_genome(&mut self, genome: &Genome, options: RefineOptions) {
32 let pairs = self.op_pairs();
33
34 let pairs = self.absorb_diff_before_refskip_against_genome(&pairs, genome, options);
35 let pairs = self.absorb_diff_after_refskip_against_genome(&pairs, genome, options);
36
37 let pairs = if options.merge_adjacent_ops {
38 Self::merge_adjacent_op_pairs(&pairs)
39 } else {
40 pairs
41 };
42
43 self.replace_ops(pairs);
44 }
45
46 pub fn refine_against_genome_default(&mut self, genome: &Genome) {
48 self.refine_against_genome(genome, RefineOptions::default());
49 }
50
51 pub fn absorb_diff_before_refskip_against_genome(
54 &self,
55 pairs: &[(ReadOpKind, u32)],
56 genome: &Genome,
57 options: RefineOptions,
58 ) -> Vec<(ReadOpKind, u32)> {
59 if options.max_diff_before_refskip == 0 || pairs.len() < 4 {
60 return pairs.to_vec();
61 }
62
63 let ops = Self::build_positioned_ops(self.ref_start0, pairs);
64 let mut out = Vec::with_capacity(pairs.len());
65 let mut i = 0usize;
66
67 while i < ops.len() {
68 if i + 3 < ops.len()
69 && Self::is_aligned_kind(ops[i].kind)
70 && ops[i + 1].kind == ReadOpKind::Diff
71 && ops[i + 1].len <= options.max_diff_before_refskip
72 && ops[i + 2].kind == ReadOpKind::RefSkip
73 && Self::is_aligned_kind(ops[i + 3].kind)
74 && self.diff_matches_reference_at_target(genome, &ops[i + 1], ops[i + 3].ref_start0)
75 {
76 out.push((ops[i].kind, ops[i].len));
77 out.push((ops[i + 2].kind, ops[i + 2].len));
78 out.push((ops[i + 3].kind, ops[i + 3].len + ops[i + 1].len));
79 i += 4;
80 } else {
81 out.push((ops[i].kind, ops[i].len));
82 i += 1;
83 }
84 }
85
86 out
87 }
88
89 pub fn absorb_diff_after_refskip_against_genome(
92 &self,
93 pairs: &[(ReadOpKind, u32)],
94 genome: &Genome,
95 options: RefineOptions,
96 ) -> Vec<(ReadOpKind, u32)> {
97 if options.max_diff_after_refskip == 0 || pairs.len() < 4 {
98 return pairs.to_vec();
99 }
100
101 let ops = Self::build_positioned_ops(self.ref_start0, pairs);
102 let mut out = Vec::with_capacity(pairs.len());
103 let mut i = 0usize;
104
105 while i < ops.len() {
106 if i + 3 < ops.len()
107 && Self::is_aligned_kind(ops[i].kind)
108 && ops[i + 1].kind == ReadOpKind::RefSkip
109 && ops[i + 2].kind == ReadOpKind::Diff
110 && ops[i + 2].len <= options.max_diff_after_refskip
111 && Self::is_aligned_kind(ops[i + 3].kind)
112 && self.diff_matches_reference_at_target(genome, &ops[i + 2], ops[i].ref_end0())
113 {
114 out.push((ops[i].kind, ops[i].len + ops[i + 2].len));
115 out.push((ops[i + 1].kind, ops[i + 1].len));
116 out.push((ops[i + 3].kind, ops[i + 3].len));
117 i += 4;
118 } else {
119 out.push((ops[i].kind, ops[i].len));
120 i += 1;
121 }
122 }
123
124 out
125 }
126
127 pub fn diff_matches_reference_at_target(
130 &self,
131 genome: &Genome,
132 diff_op: &crate::read::aligned_read::ReadOp,
133 target_ref_start0: u32,
134 ) -> bool {
135 for offset in 0..diff_op.len {
136 let read_pos = diff_op.read_start + offset;
137 let target_pos = target_ref_start0 + offset;
138
139 let Some(read_base) = self.seq.get(read_pos as usize).copied() else {
140 return false;
141 };
142
143 if !genome.base_matches(self.chr_id, target_pos, read_base) {
144 return false;
145 }
146 }
147
148 true
149 }
150
151 pub fn merge_adjacent_op_pairs(pairs: &[(ReadOpKind, u32)]) -> Vec<(ReadOpKind, u32)> {
152 let mut out: Vec<(ReadOpKind, u32)> = Vec::with_capacity(pairs.len());
153
154 for pair in pairs {
155 if pair.1 == 0 {
156 continue;
157 }
158
159 match out.last_mut() {
160 Some(last) if last.0 == pair.0 => last.1 += pair.1,
161 _ => out.push(*pair),
162 }
163 }
164
165 out
166 }
167
168 pub fn is_aligned_kind(kind: ReadOpKind) -> bool {
169 matches!(
170 kind,
171 ReadOpKind::Match | ReadOpKind::Equal | ReadOpKind::Diff
172 )
173 }
174}
175
176#[cfg(test)]
177mod tests {
178 use super::*;
179 use crate::read::aligned_read::Strand;
180
181 impl Genome {
182 fn synthetic_splice_genome() -> Self {
183 let mut seq = vec![b'N'; 300];
188
189 for b in &mut seq[100..110] {
190 *b = b'C';
191 }
192 for b in &mut seq[110..210] {
193 *b = b'A';
194 }
195 for b in &mut seq[210..226] {
196 *b = b'G';
197 }
198
199 Genome::new(vec![("chr1".to_string(), seq)]).unwrap()
200 }
201 }
202
203 impl AlignedRead {
204 fn make_read(seq: &[u8], pairs: Vec<(ReadOpKind, u32)>) -> Self {
205 Self::new(
206 0,
207 Strand::Plus,
208 100,
209 seq.to_vec(),
210 Some(vec![30; seq.len()]),
211 pairs,
212 )
213 }
214 }
215
216 #[test]
217 fn real_internal_mutation_is_not_refined_without_refskip() {
218 let genome = Genome::synthetic_splice_genome();
219
220 let mut read = AlignedRead::make_read(
221 b"CCCCCCCCCCTCCCC",
222 vec![
223 (ReadOpKind::Match, 10),
224 (ReadOpKind::Diff, 1),
225 (ReadOpKind::Match, 4),
226 ],
227 );
228
229 read.refine_against_genome(&genome, RefineOptions::default());
230
231 assert_eq!(
232 read.op_pairs(),
233 vec![
234 (ReadOpKind::Match, 10),
235 (ReadOpKind::Diff, 1),
236 (ReadOpKind::Match, 4),
237 ]
238 );
239
240 let obs = read.base_at_ref_pos(110).unwrap();
241 assert_eq!(obs.base, b'T');
242 assert_eq!(obs.read_pos, 10);
243 }
244
245 #[test]
246 fn diff_before_refskip_is_preserved_if_it_is_real_mutation_not_second_exon_base() {
247 let genome = Genome::synthetic_splice_genome();
248
249 let mut read = AlignedRead::make_read(
250 b"CCCCCCCCCCTGGGGGGGGGGGGGGG",
251 vec![
252 (ReadOpKind::Match, 10),
253 (ReadOpKind::Diff, 1),
254 (ReadOpKind::RefSkip, 100),
255 (ReadOpKind::Match, 15),
256 ],
257 );
258
259 read.refine_against_genome(&genome, RefineOptions::default());
260
261 assert_eq!(
262 read.op_pairs(),
263 vec![
264 (ReadOpKind::Match, 10),
265 (ReadOpKind::Diff, 1),
266 (ReadOpKind::RefSkip, 100),
267 (ReadOpKind::Match, 15),
268 ]
269 );
270
271 let mutation = read.base_at_ref_pos(110).unwrap();
272 assert_eq!(mutation.base, b'T');
273 assert_eq!(mutation.read_pos, 10);
274 }
275
276 #[test]
277 fn diff_after_refskip_is_preserved_if_it_is_not_left_exon_reference() {
278 let genome = Genome::synthetic_splice_genome();
279
280 let mut read = AlignedRead::make_read(
281 b"CCCCCCCCCCTGGGGGGGGGGGGGGG",
282 vec![
283 (ReadOpKind::Match, 10),
284 (ReadOpKind::RefSkip, 100),
285 (ReadOpKind::Diff, 1),
286 (ReadOpKind::Match, 15),
287 ],
288 );
289
290 read.refine_against_genome(&genome, RefineOptions::default());
291
292 assert_eq!(
293 read.op_pairs(),
294 vec![
295 (ReadOpKind::Match, 10),
296 (ReadOpKind::RefSkip, 100),
297 (ReadOpKind::Diff, 1),
298 (ReadOpKind::Match, 15),
299 ]
300 );
301
302 let mutation = read.base_at_ref_pos(210).unwrap();
303 assert_eq!(mutation.base, b'T');
304 assert_eq!(mutation.read_pos, 10);
305 }
306
307 #[test]
308 fn larger_diff_is_not_refined_by_default() {
309 let genome = Genome::synthetic_splice_genome();
310
311 let mut read = AlignedRead::make_read(
312 b"CCCCCCCCCCGGGGGGGGGGGGGGGGG",
313 vec![
314 (ReadOpKind::Match, 10),
315 (ReadOpKind::Diff, 2),
316 (ReadOpKind::RefSkip, 100),
317 (ReadOpKind::Match, 15),
318 ],
319 );
320
321 read.refine_against_genome(&genome, RefineOptions::default());
322
323 assert_eq!(
324 read.op_pairs(),
325 vec![
326 (ReadOpKind::Match, 10),
327 (ReadOpKind::Diff, 2),
328 (ReadOpKind::RefSkip, 100),
329 (ReadOpKind::Match, 15),
330 ]
331 );
332 }
333
334 #[test]
335 fn larger_diff_can_be_refined_when_configured_and_reference_matches() {
336 let genome = Genome::synthetic_splice_genome();
337
338 let mut read = AlignedRead::make_read(
339 b"CCCCCCCCCCGGGGGGGGGGGGGGGGG",
340 vec![
341 (ReadOpKind::Match, 10),
342 (ReadOpKind::Diff, 2),
343 (ReadOpKind::RefSkip, 100),
344 (ReadOpKind::Match, 15),
345 ],
346 );
347
348 read.refine_against_genome(
349 &genome,
350 RefineOptions {
351 max_diff_before_refskip: 2,
352 max_diff_after_refskip: 1,
353 merge_adjacent_ops: true,
354 },
355 );
356
357 assert_eq!(
358 read.op_pairs(),
359 vec![
360 (ReadOpKind::Match, 10),
361 (ReadOpKind::RefSkip, 100),
362 (ReadOpKind::Match, 17),
363 ]
364 );
365
366 assert_eq!(read.base_at_ref_pos(210).unwrap().base, b'G');
367 assert_eq!(read.base_at_ref_pos(211).unwrap().base, b'G');
368 }
369
370 #[test]
371 fn merge_adjacent_op_pairs_collapses_neighbors_and_removes_zero_length_ops() {
372 let pairs = vec![
373 (ReadOpKind::Match, 5),
374 (ReadOpKind::Match, 3),
375 (ReadOpKind::Diff, 0),
376 (ReadOpKind::RefSkip, 100),
377 (ReadOpKind::Match, 2),
378 (ReadOpKind::Match, 4),
379 ];
380
381 let merged = AlignedRead::merge_adjacent_op_pairs(&pairs);
382
383 assert_eq!(
384 merged,
385 vec![
386 (ReadOpKind::Match, 8),
387 (ReadOpKind::RefSkip, 100),
388 (ReadOpKind::Match, 6),
389 ]
390 );
391 }
392}