use std::borrow::Cow;
use memchr::memchr2;
use crate::bitkmer::BitNuclKmer;
use crate::kmer::{CanonicalKmers, Kmers};
pub fn normalize(seq: &[u8], allow_iupac: bool) -> Option<Vec<u8>> {
let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
let mut changed: bool = false;
for n in seq.iter() {
let (new_char, char_changed) = match (*n, allow_iupac) {
c @ (b'A', _)
| c @ (b'C', _)
| c @ (b'G', _)
| c @ (b'T', _)
| c @ (b'N', _)
| c @ (b'-', _) => (c.0, false),
(b'a', _) => (b'A', true),
(b'c', _) => (b'C', true),
(b'g', _) => (b'G', true),
(b't', _) | (b'u', _) | (b'U', _) => (b'T', true),
(b'.', _) | (b'~', _) => (b'-', true),
c @ (b'B', true)
| c @ (b'D', true)
| c @ (b'H', true)
| c @ (b'V', true)
| c @ (b'R', true)
| c @ (b'Y', true)
| c @ (b'S', true)
| c @ (b'W', true)
| c @ (b'K', true)
| c @ (b'M', true) => (c.0, false),
(b'b', true) => (b'B', true),
(b'd', true) => (b'D', true),
(b'h', true) => (b'H', true),
(b'v', true) => (b'V', true),
(b'r', true) => (b'R', true),
(b'y', true) => (b'Y', true),
(b's', true) => (b'S', true),
(b'w', true) => (b'W', true),
(b'k', true) => (b'K', true),
(b'm', true) => (b'M', true),
(b' ', _) | (b'\t', _) | (b'\r', _) | (b'\n', _) => (b' ', true),
_ => (b'N', true),
};
changed = changed || char_changed;
if new_char != b' ' {
buf.push(new_char);
}
}
if changed {
Some(buf)
} else {
None
}
}
#[test]
fn test_normalize() {
assert_eq!(normalize(b"ACGTU", false), Some(b"ACGTT".to_vec()));
assert_eq!(normalize(b"acgtu", false), Some(b"ACGTT".to_vec()));
assert_eq!(normalize(b"N.N-N~N N", false), Some(b"N-N-N-NN".to_vec()));
assert_eq!(normalize(b"BDHVRYSWKM", true), None);
assert_eq!(normalize(b"bdhvryswkm", true), Some(b"BDHVRYSWKM".to_vec()));
assert_eq!(
normalize(b"BDHVRYSWKM", false),
Some(b"NNNNNNNNNN".to_vec())
);
assert_eq!(
normalize(b"bdhvryswkm", false),
Some(b"NNNNNNNNNN".to_vec())
);
}
#[inline]
pub fn complement(n: u8) -> u8 {
match n {
b'a' => b't',
b'A' => b'T',
b'c' => b'g',
b'C' => b'G',
b'g' => b'c',
b'G' => b'C',
b't' => b'a',
b'T' => b'A',
b'r' => b'y',
b'y' => b'r',
b'k' => b'm',
b'm' => b'k',
b'b' => b'v',
b'v' => b'b',
b'd' => b'h',
b'h' => b'd',
b's' => b's',
b'w' => b'w',
b'R' => b'Y',
b'Y' => b'R',
b'K' => b'M',
b'M' => b'K',
b'B' => b'V',
b'V' => b'B',
b'D' => b'H',
b'H' => b'D',
b'S' => b'S',
b'W' => b'W',
x => x,
}
}
pub fn canonical(seq: &[u8]) -> Cow<[u8]> {
let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
let mut enough = false;
let mut original_was_canonical = false;
for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) {
buf.push(rn);
if !enough && (*n < rn) {
original_was_canonical = true;
break;
} else if !enough && (rn < *n) {
enough = true;
}
}
match (original_was_canonical, enough) {
(true, true) => panic!("Bug: should never set original_was_canonical if enough == true"),
(true, false) => seq.into(),
(false, true) => buf.into(),
(false, false) => seq.into(),
}
}
pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> {
let reverse_complement: Vec<u8> = seq.iter().rev().map(|n| complement(*n)).collect();
let mut minmer = Cow::Borrowed(&seq[..length]);
for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) {
if *kmer < minmer[..] {
minmer = kmer.into();
}
if *rc_kmer < minmer[..] {
minmer = rc_kmer.to_vec().into();
}
}
minmer
}
pub trait Sequence<'a> {
fn sequence(&'a self) -> &'a [u8];
fn strip_returns(&'a self) -> Cow<'a, [u8]> {
let seq = self.sequence();
let mut i;
match memchr2(b'\r', b'\n', &seq) {
Some(break_loc) => i = break_loc,
None => return seq.into(),
}
let mut new_buf = Vec::with_capacity(seq.len() - 1);
new_buf.extend_from_slice(&seq[..i]);
while i < seq.len() {
match memchr2(b'\r', b'\n', &seq[i..]) {
None => {
new_buf.extend_from_slice(&seq[i..]);
break;
}
Some(match_pos) => {
new_buf.extend_from_slice(&seq[i..i + match_pos]);
i += match_pos + 1;
}
}
}
new_buf.into()
}
fn reverse_complement(&'a self) -> Vec<u8> {
self.sequence()
.iter()
.rev()
.map(|n| complement(*n))
.collect()
}
fn normalize(&'a self, iupac: bool) -> Cow<'a, [u8]> {
if let Some(s) = normalize(&self.sequence(), iupac) {
s.into()
} else {
self.sequence().into()
}
}
fn canonical_kmers(&'a self, k: u8, reverse_complement: &'a [u8]) -> CanonicalKmers<'a> {
CanonicalKmers::new(self.sequence().as_ref(), reverse_complement, k)
}
fn kmers(&'a self, k: u8) -> Kmers<'a> {
Kmers::new(self.sequence().as_ref(), k)
}
fn bit_kmers(&'a self, k: u8, canonical: bool) -> BitNuclKmer<'a> {
BitNuclKmer::new(self.sequence(), k, canonical)
}
}
impl<'a> Sequence<'a> for &'a [u8] {
fn sequence(&'a self) -> &'a [u8] {
&self
}
}
impl<'a> Sequence<'a> for [u8] {
fn sequence(&'a self) -> &'a [u8] {
&self
}
}
impl<'a> Sequence<'a> for Cow<'a, [u8]> {
fn sequence(&'a self) -> &'a [u8] {
&self
}
}
pub trait QualitySequence<'a>: Sequence<'a> {
fn quality(&'a self) -> &'a [u8];
fn quality_mask(&'a self, score: u8) -> Cow<'a, [u8]> {
let qual = self.quality();
let seq: Vec<u8> = self
.sequence()
.iter()
.zip(qual.iter())
.map(|(base, qual)| if *qual < score { b'N' } else { *base })
.collect();
seq.into()
}
}
impl<'a> Sequence<'a> for (&'a [u8], &'a [u8]) {
fn sequence(&'a self) -> &'a [u8] {
&self.0
}
}
impl<'a> QualitySequence<'a> for (&'a [u8], &'a [u8]) {
fn quality(&'a self) -> &'a [u8] {
&self.1
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_complement() {
assert_eq!(complement(b'a'), b't');
assert_eq!(complement(b'c'), b'g');
assert_eq!(complement(b'g'), b'c');
assert_eq!(complement(b'n'), b'n');
}
#[test]
fn can_canonicalize() {
assert!(canonical(b"A") == Cow::Borrowed(b"A"));
assert!(canonical(b"T") == Cow::Owned::<[u8]>(b"A".to_vec()));
assert!(canonical(b"AAGT") == Cow::Borrowed(b"AAGT"));
assert!(canonical(b"ACTT") == Cow::Owned::<[u8]>(b"AAGT".to_vec()));
assert!(canonical(b"GC") == Cow::Borrowed(b"GC"));
}
#[test]
fn can_minimize() {
let minmer = minimizer(&b"ATTTCG"[..], 3);
assert_eq!(&minmer[..], b"AAA");
}
#[test]
fn test_quality_mask() {
let seq_rec = (&b"AGCT"[..], &b"AAA0"[..]);
let filtered_rec = seq_rec.quality_mask(b'5');
assert_eq!(&filtered_rec[..], &b"AGCN"[..]);
}
}