1use crate::utils;
4
5use std::ops::Range;
6
7#[derive(Clone, Debug, PartialEq, Eq)]
13pub struct Mapping {
14 seq_interval: Range<usize>,
16 handle: usize,
18 node_len: usize,
20 node_interval: Range<usize>,
22 edit: Difference,
24}
25
26impl Mapping {
27 pub fn new(seq_offset: usize, handle: usize, node_len: usize, node_offset: usize, edit: Difference) -> Self {
34 assert!(node_offset + edit.target_len() <= node_len, "The edit extends past the end of the node");
35 assert!(handle != gbz::ENDMARKER, "A mapping cannot have {} as the target handle", gbz::ENDMARKER);
36 Self {
37 seq_interval: seq_offset..seq_offset + edit.query_len(),
38 handle,
39 node_len,
40 node_interval: node_offset..node_offset + edit.target_len(),
41 edit,
42 }
43 }
44
45 pub fn unaligned(seq_offset: usize, edit: Difference) -> Self {
51 assert!(matches!(edit, Difference::Insertion(_)), "An unaligned mapping must have an insertion edit");
52 Self {
53 seq_interval: seq_offset..seq_offset + edit.query_len(),
54 handle: gbz::ENDMARKER,
55 node_len: 0,
56 node_interval: 0..0,
57 edit,
58 }
59 }
60
61 #[inline]
63 pub fn seq_interval(&self) -> &Range<usize> {
64 &self.seq_interval
65 }
66
67 #[inline]
69 pub fn query_len(&self) -> usize {
70 self.seq_interval.len()
71 }
72
73 #[inline]
77 pub fn handle(&self) -> usize {
78 self.handle
79 }
80
81 #[inline]
83 pub fn is_unaligned(&self) -> bool {
84 self.handle == gbz::ENDMARKER
85 }
86
87 #[inline]
89 pub fn node_len(&self) -> usize {
90 self.node_len
91 }
92
93 #[inline]
95 pub fn node_interval(&self) -> &Range<usize> {
96 &self.node_interval
97 }
98
99 #[inline]
101 pub fn target_len(&self) -> usize {
102 self.node_interval.len()
103 }
104
105 #[inline]
107 pub fn edit(&self) -> &Difference {
108 &self.edit
109 }
110
111 #[inline]
113 pub fn follows(&self, other: &Self) -> bool {
114 self.handle == other.handle && self.node_interval.start == other.node_interval.end
115 }
116
117 #[inline]
119 pub fn is_at_start(&self) -> bool {
120 self.node_interval.start == 0
121 }
122
123 #[inline]
125 pub fn is_at_end(&self) -> bool {
126 self.node_interval.end == self.node_len
127 }
128}
129
130#[derive(Clone, Debug, PartialEq, Eq)]
163pub enum Difference {
164 Match(usize),
166 Mismatch(u8),
168 Insertion(Vec<u8>),
170 Deletion(usize),
172 End,
174}
175
176impl Difference {
177 pub const NUM_TYPES: usize = 5;
179
180 pub const UNKNOWN_BASE: u8 = b'X';
182
183 const OPS: &'static [u8] = b"=:*+-";
185
186 fn base_to_upper(c: u8) -> u8 {
187 if c.is_ascii_lowercase() {
188 c - b'a' + b'A'
189 } else {
190 c
191 }
192 }
193
194 fn seq_to_upper(seq: &[u8]) -> Vec<u8> {
195 seq.iter().map(|&c| Self::base_to_upper(c)).collect()
196 }
197
198 fn matching_sequence(value: &[u8]) -> Option<Self> {
199 Some(Self::Match(value.len()))
200 }
201
202 fn match_length(value: &[u8]) -> Option<Self> {
203 let len = str::from_utf8(value).ok()?;
204 let len = len.parse::<usize>().ok()?;
205 Some(Self::Match(len))
206 }
207
208 fn mismatch(value: &[u8]) -> Option<Self> {
209 if value.len() != 2 {
210 return None;
211 }
212 Some(Self::Mismatch(Self::base_to_upper(value[1])))
213 }
214
215 fn insertion(value: &[u8]) -> Option<Self> {
216 Some(Self::Insertion(Self::seq_to_upper(value)))
217 }
218
219 fn deletion(value: &[u8]) -> Option<Self> {
220 Some(Self::Deletion(value.len()))
221 }
222
223 pub fn parse(difference_string: &[u8]) -> Result<Vec<Self>, String> {
227 let mut result: Vec<Self> = Vec::new();
228 if difference_string.is_empty() {
229 return Ok(result);
230 }
231 if !Self::OPS.contains(&difference_string[0]) {
232 return Err(format!("Invalid difference string operation: {}", difference_string[0] as char));
233 }
234
235 let mut start = 0;
236 while start < difference_string.len() {
237 let mut end = start + 1;
238 while end < difference_string.len() && !Self::OPS.contains(&difference_string[end]) {
239 end += 1;
240 }
241 let value = &difference_string[start + 1..end];
242 let op = match difference_string[start] {
243 b'=' => Self::matching_sequence(value),
244 b':' => Self::match_length(value),
245 b'*' => Self::mismatch(value),
246 b'+' => Self::insertion(value),
247 b'-' => Self::deletion(value),
248 _ => return Err(format!("Invalid difference string operation: {}", difference_string[start] as char)),
249 }.ok_or_else(|| format!("Invalid difference string field: {}", String::from_utf8_lossy(&difference_string[start..end])))?;
250 result.push(op);
251 start = end;
252 }
253
254 Ok(result)
255 }
256
257 pub fn parse_normalized(difference_string: &[u8]) -> Result<Vec<Self>, String> {
262 let ops = Self::parse(difference_string)?;
263 Ok(Self::normalize(ops))
264 }
265
266 pub fn stats(difference_string: &[Self]) -> (usize, usize, usize, usize) {
270 let mut query_len = 0;
271 let mut target_len = 0;
272 let mut matches = 0;
273 let mut edits = 0;
274 for diff in difference_string.iter() {
275 match diff {
276 Self::Match(len) => {
277 query_len += len;
278 target_len += len;
279 matches += len;
280 },
281 Self::Mismatch(_) => {
282 query_len += 1;
283 target_len += 1;
284 edits += 1;
285 }
286 Self::Insertion(seq) => {
287 query_len += seq.len();
288 edits += seq.len();
289 }
290 Self::Deletion(len) => {
291 target_len += len;
292 edits += len;
293 },
294 Self::End => {},
295 }
296 }
297 (query_len, target_len, matches, edits)
298 }
299
300 pub fn len(&self) -> usize {
302 match self {
303 Self::Match(len) => *len,
304 Self::Mismatch(_) => 1,
305 Self::Insertion(seq) => seq.len(),
306 Self::Deletion(len) => *len,
307 Self::End => 0,
308 }
309 }
310
311 pub fn is_empty(&self) -> bool {
313 self.len() == 0
314 }
315
316 pub fn target_len(&self) -> usize {
318 match self {
319 Self::Match(len) => *len,
320 Self::Mismatch(_) => 1,
321 Self::Insertion(_) => 0,
322 Self::Deletion(len) => *len,
323 Self::End => 0,
324 }
325 }
326
327 pub fn query_len(&self) -> usize {
329 match self {
330 Self::Match(len) => *len,
331 Self::Mismatch(_) => 1,
332 Self::Insertion(seq) => seq.len(),
333 Self::Deletion(_) => 0,
334 Self::End => 0,
335 }
336 }
337
338 pub fn try_merge(&mut self, op: &Self) -> bool {
342 match (self, op) {
343 (Self::Match(len1), Self::Match(len2)) => {
344 *len1 += len2;
345 true
346 },
347 (Self::Insertion(seq1), Self::Insertion(seq2)) => {
348 seq1.extend_from_slice(seq2);
349 true
350 },
351 (Self::Deletion(len1), Self::Deletion(len2)) => {
352 *len1 += len2;
353 true
354 },
355 _ => false,
356 }
357 }
358
359 pub fn normalize(ops: Vec<Self>) -> Vec<Self> {
363 let mut result = ops;
364 let mut tail = 0;
365 for i in 0..result.len() {
366 if result[i].is_empty() {
367 continue;
368 }
369 if tail > 0 {
370 let (left, right) = result.split_at_mut(i);
372 if left[tail - 1].try_merge(&right[0]) {
373 continue;
374 }
375 }
376 result.swap(tail, i);
377 tail += 1;
378 }
379
380 result.truncate(tail);
381 result
382 }
383
384 pub fn to_bytes(ops: &[Difference], target_sequence: &[u8]) -> Vec<u8> {
386 let mut result = Vec::new();
387 let mut target_offset = 0;
388 for op in ops.iter() {
389 match op {
390 Self::Match(len) => {
391 result.push(b':');
392 utils::append_usize(&mut result, *len);
393 target_offset += *len;
394 },
395 Self::Mismatch(base) => {
396 result.push(b'*');
397 result.push(target_sequence[target_offset]);
398 result.push(*base);
399 target_offset += 1;
400 },
401 Self::Insertion(seq) => {
402 result.push(b'+');
403 result.extend_from_slice(seq);
404 },
405 Self::Deletion(len) => {
406 result.push(b'-');
407 result.extend_from_slice(&target_sequence[target_offset..target_offset + *len]);
408 target_offset += *len;
409 },
410 Self::End => {},
411 }
412 }
413 result
414 }
415}
416
417#[cfg(test)]
420mod tests {
421
422use super::*;
423
424fn check_difference(seq: &[u8], truth: &[Difference], name: &str, normalize: bool) {
427 let result = Difference::parse(seq);
428 assert!(result.is_ok(), "Failed to parse the difference string for {}: {}", name, result.err().unwrap());
429 let mut result = result.unwrap();
430 if normalize {
431 result = Difference::normalize(result);
432 }
433 assert_eq!(result.len(), truth.len(), "Wrong number of operations for {}", name);
434 for i in 0..truth.len() {
435 assert_eq!(result[i], truth[i], "Wrong operation at position {} for {}", i, name);
436 }
437}
438
439fn check_to_bytes(ops: &[Difference], truth: &[u8], target_sequence: &[u8], name: &str) {
440 let result = Difference::to_bytes(ops, target_sequence);
441 assert_eq!(result, truth, "Wrong byte representation for {}", name);
442}
443
444#[test]
445fn difference_empty() {
446 let name = "empty";
447 let seq = b"";
448 let truth = [];
449 check_difference(seq, &truth, name, false);
450 let target_sequence = b"";
451 check_to_bytes(&truth, seq, target_sequence, name);
452}
453
454#[test]
455fn difference_single() {
456 {
457 let name = "matching sequence";
458 let seq = b"=ACGT";
459 let truth = [ Difference::Match(4) ];
460 check_difference(seq, &truth, name, false);
461 let expected = b":4"; let target_sequence = b"ACGT";
463 check_to_bytes(&truth, expected, target_sequence, name);
464 }
465 {
466 let name = "match length";
467 let seq = b":5";
468 let truth = [ Difference::Match(5) ];
469 check_difference(seq, &truth, name, false);
470 let target_sequence = b"GATTA";
471 check_to_bytes(&truth, seq, target_sequence, name);
472 }
473 {
474 let name = "mismatch";
475 let seq = b"*AC";
476 let truth = [ Difference::Mismatch(b'C') ];
477 check_difference(seq, &truth, name, false);
478 let target_sequence = b"A";
479 check_to_bytes(&truth, seq, target_sequence, name);
480 }
481 {
482 let name = "insertion";
483 let seq = b"+ACGT";
484 let truth = [ Difference::Insertion(b"ACGT".to_vec()) ];
485 check_difference(seq, &truth, name, false);
486 let target_sequence = b"";
487 check_to_bytes(&truth, seq, target_sequence, name);
488 }
489 {
490 let name = "deletion";
491 let seq = b"-ACGT";
492 let truth = [ Difference::Deletion(4) ];
493 check_difference(seq, &truth, name, false);
494 let target_sequence = b"ACGT";
495 check_to_bytes(&truth, seq, target_sequence, name);
496 }
497}
498
499#[test]
500fn difference_multi() {
501 {
504 let name = "ERR3239454.129477191 fragment 2";
505 let seq = b":24*CG:78-C:35*TG:2*TG:1*TG:6";
506 let truth = [
507 Difference::Match(24),
508 Difference::Mismatch(b'G'),
509 Difference::Match(78),
510 Difference::Deletion(1),
511 Difference::Match(35),
512 Difference::Mismatch(b'G'),
513 Difference::Match(2),
514 Difference::Mismatch(b'G'),
515 Difference::Match(1),
516 Difference::Mismatch(b'G'),
517 Difference::Match(6)
518 ];
519 check_difference(seq, &truth, name, false);
520 let mut target_sequence = vec![b'-'; 151];
521 target_sequence[24] = b'C';
522 target_sequence[103] = b'C';
523 target_sequence[139] = b'T';
524 target_sequence[142] = b'T';
525 target_sequence[144] = b'T';
526 check_to_bytes(&truth, seq, &target_sequence, name);
527 }
528 {
529 let name = "ERR3239454.11251898 fragment 1";
530 let seq = b":129+TTGAGGGGGTATAAGAGGTCG";
531 let truth = [
532 Difference::Match(129),
533 Difference::Insertion(b"TTGAGGGGGTATAAGAGGTCG".to_vec())
534 ];
535 check_difference(seq, &truth, name, false);
536 let target_sequence = vec![b'-', 129];
537 check_to_bytes(&truth, seq, &target_sequence, name);
538 }
539 {
540 let name = "ERR3239454.97848632 fragment 1";
541 let seq = b":111*GT:20*CA:6*GT:2*GT:3*GT:3";
542 let truth = [
543 Difference::Match(111),
544 Difference::Mismatch(b'T'),
545 Difference::Match(20),
546 Difference::Mismatch(b'A'),
547 Difference::Match(6),
548 Difference::Mismatch(b'T'),
549 Difference::Match(2),
550 Difference::Mismatch(b'T'),
551 Difference::Match(3),
552 Difference::Mismatch(b'T'),
553 Difference::Match(3)
554 ];
555 check_difference(seq, &truth, name, false);
556 let mut target_sequence = vec![b'-'; 150];
557 target_sequence[111] = b'G';
558 target_sequence[132] = b'C';
559 target_sequence[139] = b'G';
560 target_sequence[142] = b'G';
561 target_sequence[146] = b'G';
562 check_to_bytes(&truth, seq, &target_sequence, name);
563 }
564}
565
566#[test]
567fn difference_case() {
568 {
569 let seq = b"*ac";
571 let truth = [ Difference::Mismatch(b'C') ];
572 check_difference(seq, &truth, "lower case mismatch", false);
573 }
574 {
575 let seq = b"+acgt";
577 let truth = [ Difference::Insertion(b"ACGT".to_vec()) ];
578 check_difference(seq, &truth, "lower case insertion", false);
579 }
580}
581
582fn invalid_difference(seq: &[u8], name: &str) {
583 let result = Difference::parse(seq);
584 assert!(result.is_err(), "Parsed an invalid difference string: {}", name);
585}
586
587#[test]
588fn difference_invalid() {
589 invalid_difference(b"ABC", "invalid operation at start");
590 invalid_difference(b"+AC:", "empty operation at end");
591 invalid_difference(b"~AC30AC", "intron / splice operation");
592 invalid_difference(b":123A", "match length is not an integer");
593 invalid_difference(b"+GATTACA:", "empty match length");
594 invalid_difference(b"*ACT", "mismatch is too long");
595 invalid_difference(b"*A", "mismatch is too short");
596}
597
598#[test]
599fn difference_len() {
600 {
601 let diff = Difference::Match(42);
602 assert_eq!(diff.len(), 42, "Wrong length for a match");
603 }
604 {
605 let diff = Difference::Mismatch(b'A');
606 assert_eq!(diff.len(), 1, "Wrong length for a mismatch");
607 }
608 {
609 let diff = Difference::Insertion(b"GATTACA".to_vec());
610 assert_eq!(diff.len(), 7, "Wrong length for an insertion");
611 }
612 {
613 let diff = Difference::Deletion(42);
614 assert_eq!(diff.len(), 42, "Wrong length for a deletion");
615 }
616}
617
618#[test]
619fn difference_normalize() {
620 {
621 let seq = b"=ACGT:4*AC*GT:2:3=ACT*AC-ACGT-ACGT+GATTACA+CAT";
622 let truth = [
623 Difference::Match(8),
624 Difference::Mismatch(b'C'),
625 Difference::Mismatch(b'T'),
626 Difference::Match(8),
627 Difference::Mismatch(b'C'),
628 Difference::Deletion(8),
629 Difference::Insertion(b"GATTACACAT".to_vec()),
630 ];
631 check_difference(seq, &truth, "normalized", true);
632 }
633 {
634 let seq = b"=:0+-*AC:30=GATTA+-:0=";
635 let truth = [
636 Difference::Mismatch(b'C'),
637 Difference::Match(35),
638 ];
639 check_difference(seq, &truth, "normalized with empty operations", true);
640 }
641}
642
643}
644
645