use crate::utils;
use std::ops::Range;
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Mapping {
seq_interval: Range<usize>,
handle: usize,
node_len: usize,
node_interval: Range<usize>,
edit: Difference,
}
impl Mapping {
pub fn new(seq_offset: usize, handle: usize, node_len: usize, node_offset: usize, edit: Difference) -> Self {
assert!(node_offset + edit.target_len() <= node_len, "The edit extends past the end of the node");
assert!(handle != gbz::ENDMARKER, "A mapping cannot have {} as the target handle", gbz::ENDMARKER);
Self {
seq_interval: seq_offset..seq_offset + edit.query_len(),
handle,
node_len,
node_interval: node_offset..node_offset + edit.target_len(),
edit,
}
}
pub fn unaligned(seq_offset: usize, edit: Difference) -> Self {
assert!(matches!(edit, Difference::Insertion(_)), "An unaligned mapping must have an insertion edit");
Self {
seq_interval: seq_offset..seq_offset + edit.query_len(),
handle: gbz::ENDMARKER,
node_len: 0,
node_interval: 0..0,
edit,
}
}
#[inline]
pub fn seq_interval(&self) -> &Range<usize> {
&self.seq_interval
}
#[inline]
pub fn query_len(&self) -> usize {
self.seq_interval.len()
}
#[inline]
pub fn handle(&self) -> usize {
self.handle
}
#[inline]
pub fn is_unaligned(&self) -> bool {
self.handle == gbz::ENDMARKER
}
#[inline]
pub fn node_len(&self) -> usize {
self.node_len
}
#[inline]
pub fn node_interval(&self) -> &Range<usize> {
&self.node_interval
}
#[inline]
pub fn target_len(&self) -> usize {
self.node_interval.len()
}
#[inline]
pub fn edit(&self) -> &Difference {
&self.edit
}
#[inline]
pub fn follows(&self, other: &Self) -> bool {
self.handle == other.handle && self.node_interval.start == other.node_interval.end
}
#[inline]
pub fn is_at_start(&self) -> bool {
self.node_interval.start == 0
}
#[inline]
pub fn is_at_end(&self) -> bool {
self.node_interval.end == self.node_len
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Difference {
Match(usize),
Mismatch(u8),
Insertion(Vec<u8>),
Deletion(usize),
End,
}
impl Difference {
pub const NUM_TYPES: usize = 5;
pub const UNKNOWN_BASE: u8 = b'X';
const OPS: &'static [u8] = b"=:*+-";
fn base_to_upper(c: u8) -> u8 {
if c.is_ascii_lowercase() {
c - b'a' + b'A'
} else {
c
}
}
fn seq_to_upper(seq: &[u8]) -> Vec<u8> {
seq.iter().map(|&c| Self::base_to_upper(c)).collect()
}
fn matching_sequence(value: &[u8]) -> Option<Self> {
Some(Self::Match(value.len()))
}
fn match_length(value: &[u8]) -> Option<Self> {
let len = str::from_utf8(value).ok()?;
let len = len.parse::<usize>().ok()?;
Some(Self::Match(len))
}
fn mismatch(value: &[u8]) -> Option<Self> {
if value.len() != 2 {
return None;
}
Some(Self::Mismatch(Self::base_to_upper(value[1])))
}
fn insertion(value: &[u8]) -> Option<Self> {
Some(Self::Insertion(Self::seq_to_upper(value)))
}
fn deletion(value: &[u8]) -> Option<Self> {
Some(Self::Deletion(value.len()))
}
pub fn parse(difference_string: &[u8]) -> Result<Vec<Self>, String> {
let mut result: Vec<Self> = Vec::new();
if difference_string.is_empty() {
return Ok(result);
}
if !Self::OPS.contains(&difference_string[0]) {
return Err(format!("Invalid difference string operation: {}", difference_string[0] as char));
}
let mut start = 0;
while start < difference_string.len() {
let mut end = start + 1;
while end < difference_string.len() && !Self::OPS.contains(&difference_string[end]) {
end += 1;
}
let value = &difference_string[start + 1..end];
let op = match difference_string[start] {
b'=' => Self::matching_sequence(value),
b':' => Self::match_length(value),
b'*' => Self::mismatch(value),
b'+' => Self::insertion(value),
b'-' => Self::deletion(value),
_ => return Err(format!("Invalid difference string operation: {}", difference_string[start] as char)),
}.ok_or(format!("Invalid difference string field: {}", String::from_utf8_lossy(&difference_string[start..end])))?;
result.push(op);
start = end;
}
Ok(result)
}
pub fn parse_normalized(difference_string: &[u8]) -> Result<Vec<Self>, String> {
let ops = Self::parse(difference_string)?;
Ok(Self::normalize(ops))
}
pub fn stats(difference_string: &[Self]) -> (usize, usize, usize, usize) {
let mut query_len = 0;
let mut target_len = 0;
let mut matches = 0;
let mut edits = 0;
for diff in difference_string.iter() {
match diff {
Self::Match(len) => {
query_len += len;
target_len += len;
matches += len;
},
Self::Mismatch(_) => {
query_len += 1;
target_len += 1;
edits += 1;
}
Self::Insertion(seq) => {
query_len += seq.len();
edits += seq.len();
}
Self::Deletion(len) => {
target_len += len;
edits += len;
},
Self::End => {},
}
}
(query_len, target_len, matches, edits)
}
pub fn len(&self) -> usize {
match self {
Self::Match(len) => *len,
Self::Mismatch(_) => 1,
Self::Insertion(seq) => seq.len(),
Self::Deletion(len) => *len,
Self::End => 0,
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn target_len(&self) -> usize {
match self {
Self::Match(len) => *len,
Self::Mismatch(_) => 1,
Self::Insertion(_) => 0,
Self::Deletion(len) => *len,
Self::End => 0,
}
}
pub fn query_len(&self) -> usize {
match self {
Self::Match(len) => *len,
Self::Mismatch(_) => 1,
Self::Insertion(seq) => seq.len(),
Self::Deletion(_) => 0,
Self::End => 0,
}
}
pub fn try_merge(&mut self, op: &Self) -> bool {
match (self, op) {
(Self::Match(len1), Self::Match(len2)) => {
*len1 += len2;
true
},
(Self::Insertion(seq1), Self::Insertion(seq2)) => {
seq1.extend_from_slice(seq2);
true
},
(Self::Deletion(len1), Self::Deletion(len2)) => {
*len1 += len2;
true
},
_ => false,
}
}
pub fn normalize(ops: Vec<Self>) -> Vec<Self> {
let mut result = ops;
let mut tail = 0;
for i in 0..result.len() {
if result[i].is_empty() {
continue;
}
if tail > 0 {
let (left, right) = result.split_at_mut(i);
if left[tail - 1].try_merge(&right[0]) {
continue;
}
}
result.swap(tail, i);
tail += 1;
}
result.truncate(tail);
result
}
pub fn to_bytes(ops: &[Difference], target_sequence: &[u8]) -> Vec<u8> {
let mut result = Vec::new();
let mut target_offset = 0;
for op in ops.iter() {
match op {
Self::Match(len) => {
result.push(b':');
utils::append_usize(&mut result, *len);
target_offset += *len;
},
Self::Mismatch(base) => {
result.push(b'*');
result.push(target_sequence[target_offset]);
result.push(*base);
target_offset += 1;
},
Self::Insertion(seq) => {
result.push(b'+');
result.extend_from_slice(seq);
},
Self::Deletion(len) => {
result.push(b'-');
result.extend_from_slice(&target_sequence[target_offset..target_offset + *len]);
target_offset += *len;
},
Self::End => {},
}
}
result
}
}
#[cfg(test)]
mod tests {
use super::*;
fn check_difference(seq: &[u8], truth: &[Difference], name: &str, normalize: bool) {
let result = Difference::parse(seq);
assert!(result.is_ok(), "Failed to parse the difference string for {}: {}", name, result.err().unwrap());
let mut result = result.unwrap();
if normalize {
result = Difference::normalize(result);
}
assert_eq!(result.len(), truth.len(), "Wrong number of operations for {}", name);
for i in 0..truth.len() {
assert_eq!(result[i], truth[i], "Wrong operation at position {} for {}", i, name);
}
}
fn check_to_bytes(ops: &[Difference], truth: &[u8], target_sequence: &[u8], name: &str) {
let result = Difference::to_bytes(ops, target_sequence);
assert_eq!(result, truth, "Wrong byte representation for {}", name);
}
#[test]
fn difference_empty() {
let name = "empty";
let seq = b"";
let truth = [];
check_difference(seq, &truth, name, false);
let target_sequence = b"";
check_to_bytes(&truth, seq, target_sequence, name);
}
#[test]
fn difference_single() {
{
let name = "matching sequence";
let seq = b"=ACGT";
let truth = [ Difference::Match(4) ];
check_difference(seq, &truth, name, false);
let expected = b":4"; let target_sequence = b"ACGT";
check_to_bytes(&truth, expected, target_sequence, name);
}
{
let name = "match length";
let seq = b":5";
let truth = [ Difference::Match(5) ];
check_difference(seq, &truth, name, false);
let target_sequence = b"GATTA";
check_to_bytes(&truth, seq, target_sequence, name);
}
{
let name = "mismatch";
let seq = b"*AC";
let truth = [ Difference::Mismatch(b'C') ];
check_difference(seq, &truth, name, false);
let target_sequence = b"A";
check_to_bytes(&truth, seq, target_sequence, name);
}
{
let name = "insertion";
let seq = b"+ACGT";
let truth = [ Difference::Insertion(b"ACGT".to_vec()) ];
check_difference(seq, &truth, name, false);
let target_sequence = b"";
check_to_bytes(&truth, seq, target_sequence, name);
}
{
let name = "deletion";
let seq = b"-ACGT";
let truth = [ Difference::Deletion(4) ];
check_difference(seq, &truth, name, false);
let target_sequence = b"ACGT";
check_to_bytes(&truth, seq, target_sequence, name);
}
}
#[test]
fn difference_multi() {
{
let name = "ERR3239454.129477191 fragment 2";
let seq = b":24*CG:78-C:35*TG:2*TG:1*TG:6";
let truth = [
Difference::Match(24),
Difference::Mismatch(b'G'),
Difference::Match(78),
Difference::Deletion(1),
Difference::Match(35),
Difference::Mismatch(b'G'),
Difference::Match(2),
Difference::Mismatch(b'G'),
Difference::Match(1),
Difference::Mismatch(b'G'),
Difference::Match(6)
];
check_difference(seq, &truth, name, false);
let mut target_sequence = vec![b'-'; 151];
target_sequence[24] = b'C';
target_sequence[103] = b'C';
target_sequence[139] = b'T';
target_sequence[142] = b'T';
target_sequence[144] = b'T';
check_to_bytes(&truth, seq, &target_sequence, name);
}
{
let name = "ERR3239454.11251898 fragment 1";
let seq = b":129+TTGAGGGGGTATAAGAGGTCG";
let truth = [
Difference::Match(129),
Difference::Insertion(b"TTGAGGGGGTATAAGAGGTCG".to_vec())
];
check_difference(seq, &truth, name, false);
let target_sequence = vec![b'-', 129];
check_to_bytes(&truth, seq, &target_sequence, name);
}
{
let name = "ERR3239454.97848632 fragment 1";
let seq = b":111*GT:20*CA:6*GT:2*GT:3*GT:3";
let truth = [
Difference::Match(111),
Difference::Mismatch(b'T'),
Difference::Match(20),
Difference::Mismatch(b'A'),
Difference::Match(6),
Difference::Mismatch(b'T'),
Difference::Match(2),
Difference::Mismatch(b'T'),
Difference::Match(3),
Difference::Mismatch(b'T'),
Difference::Match(3)
];
check_difference(seq, &truth, name, false);
let mut target_sequence = vec![b'-'; 150];
target_sequence[111] = b'G';
target_sequence[132] = b'C';
target_sequence[139] = b'G';
target_sequence[142] = b'G';
target_sequence[146] = b'G';
check_to_bytes(&truth, seq, &target_sequence, name);
}
}
#[test]
fn difference_case() {
{
let seq = b"*ac";
let truth = [ Difference::Mismatch(b'C') ];
check_difference(seq, &truth, "lower case mismatch", false);
}
{
let seq = b"+acgt";
let truth = [ Difference::Insertion(b"ACGT".to_vec()) ];
check_difference(seq, &truth, "lower case insertion", false);
}
}
fn invalid_difference(seq: &[u8], name: &str) {
let result = Difference::parse(seq);
assert!(result.is_err(), "Parsed an invalid difference string: {}", name);
}
#[test]
fn difference_invalid() {
invalid_difference(b"ABC", "invalid operation at start");
invalid_difference(b"+AC:", "empty operation at end");
invalid_difference(b"~AC30AC", "intron / splice operation");
invalid_difference(b":123A", "match length is not an integer");
invalid_difference(b"+GATTACA:", "empty match length");
invalid_difference(b"*ACT", "mismatch is too long");
invalid_difference(b"*A", "mismatch is too short");
}
#[test]
fn difference_len() {
{
let diff = Difference::Match(42);
assert_eq!(diff.len(), 42, "Wrong length for a match");
}
{
let diff = Difference::Mismatch(b'A');
assert_eq!(diff.len(), 1, "Wrong length for a mismatch");
}
{
let diff = Difference::Insertion(b"GATTACA".to_vec());
assert_eq!(diff.len(), 7, "Wrong length for an insertion");
}
{
let diff = Difference::Deletion(42);
assert_eq!(diff.len(), 42, "Wrong length for a deletion");
}
}
#[test]
fn difference_normalize() {
{
let seq = b"=ACGT:4*AC*GT:2:3=ACT*AC-ACGT-ACGT+GATTACA+CAT";
let truth = [
Difference::Match(8),
Difference::Mismatch(b'C'),
Difference::Mismatch(b'T'),
Difference::Match(8),
Difference::Mismatch(b'C'),
Difference::Deletion(8),
Difference::Insertion(b"GATTACACAT".to_vec()),
];
check_difference(seq, &truth, "normalized", true);
}
{
let seq = b"=:0+-*AC:30=GATTA+-:0=";
let truth = [
Difference::Mismatch(b'C'),
Difference::Match(35),
];
check_difference(seq, &truth, "normalized with empty operations", true);
}
}
}