use std::borrow::Cow;
use memchr::memchr2;
use crate::bitkmer::BitNuclKmer;
use crate::kmer::{CanonicalKmers, Kmers};
pub fn normalize(seq: &[u8], allow_iupac: bool) -> Option<Vec<u8>> {
let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
let mut changed: bool = false;
for n in seq {
let (new_char, char_changed) = match (*n, allow_iupac) {
c @ (b'A' | b'C' | b'G' | b'T' | b'N' | b'-', _) => (c.0, false),
(b'a', _) => (b'A', true),
(b'c', _) => (b'C', true),
(b'g', _) => (b'G', true),
(b't' | b'u' | b'U', _) => (b'T', true),
(b'.' | b'~', _) => (b'-', true),
c @ (b'B' | b'D' | b'H' | b'V' | b'R' | b'Y' | b'S' | b'W' | b'K' | b'M', true) => {
(c.0, false)
}
(b'b', true) => (b'B', true),
(b'd', true) => (b'D', true),
(b'h', true) => (b'H', true),
(b'v', true) => (b'V', true),
(b'r', true) => (b'R', true),
(b'y', true) => (b'Y', true),
(b's', true) => (b'S', true),
(b'w', true) => (b'W', true),
(b'k', true) => (b'K', true),
(b'm', true) => (b'M', true),
(b' ' | b'\t' | b'\r' | b'\n', _) => (b' ', true),
_ => (b'N', true),
};
changed = changed || char_changed;
if new_char != b' ' {
buf.push(new_char);
}
}
if changed {
Some(buf)
} else {
None
}
}
#[inline]
pub fn complement(n: u8) -> u8 {
match n {
b'a' => b't',
b'A' => b'T',
b'c' => b'g',
b'C' => b'G',
b'g' => b'c',
b'G' => b'C',
b't' => b'a',
b'T' => b'A',
b'r' => b'y',
b'y' => b'r',
b'k' => b'm',
b'm' => b'k',
b'b' => b'v',
b'v' => b'b',
b'd' => b'h',
b'h' => b'd',
b's' => b's',
b'w' => b'w',
b'R' => b'Y',
b'Y' => b'R',
b'K' => b'M',
b'M' => b'K',
b'B' => b'V',
b'V' => b'B',
b'D' => b'H',
b'H' => b'D',
b'S' => b'S',
b'W' => b'W',
x => x,
}
}
pub fn canonical(seq: &[u8]) -> Cow<'_, [u8]> {
let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
let mut enough = false;
let mut original_was_canonical = false;
for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) {
buf.push(rn);
if !enough && (*n < rn) {
original_was_canonical = true;
break;
} else if !enough && (rn < *n) {
enough = true;
}
}
match (original_was_canonical, enough) {
(true, true) => panic!("Bug: should never set original_was_canonical if enough == true"),
(true, false) => seq.into(),
(false, true) => buf.into(),
(false, false) => seq.into(),
}
}
pub fn minimizer(seq: &[u8], length: usize) -> Cow<'_, [u8]> {
let reverse_complement: Vec<u8> = seq.iter().rev().map(|n| complement(*n)).collect();
let mut minmer = Cow::Borrowed(&seq[..length]);
for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) {
if *kmer < minmer[..] {
minmer = kmer.into();
}
if *rc_kmer < minmer[..] {
minmer = rc_kmer.to_vec().into();
}
}
minmer
}
pub trait Sequence<'a> {
fn sequence(&'a self) -> &'a [u8];
fn strip_returns(&'a self) -> Cow<'a, [u8]> {
let seq = self.sequence();
let mut i;
match memchr2(b'\r', b'\n', seq) {
Some(break_loc) => i = break_loc,
None => return seq.into(),
}
let mut new_buf = Vec::with_capacity(seq.len() - 1);
new_buf.extend_from_slice(&seq[..i]);
while i < seq.len() {
match memchr2(b'\r', b'\n', &seq[i..]) {
None => {
new_buf.extend_from_slice(&seq[i..]);
break;
}
Some(match_pos) => {
new_buf.extend_from_slice(&seq[i..i + match_pos]);
i += match_pos + 1;
}
}
}
new_buf.into()
}
fn reverse_complement(&'a self) -> Vec<u8> {
self.sequence()
.iter()
.rev()
.map(|n| complement(*n))
.collect()
}
fn normalize(&'a self, iupac: bool) -> Cow<'a, [u8]> {
if let Some(s) = normalize(self.sequence(), iupac) {
s.into()
} else {
self.sequence().into()
}
}
fn canonical_kmers(&'a self, k: u8, reverse_complement: &'a [u8]) -> CanonicalKmers<'a> {
CanonicalKmers::new(self.sequence(), reverse_complement, k)
}
fn kmers(&'a self, k: u8) -> Kmers<'a> {
Kmers::new(self.sequence(), k)
}
fn bit_kmers(&'a self, k: u8, canonical: bool) -> BitNuclKmer<'a> {
BitNuclKmer::new(self.sequence(), k, canonical)
}
}
impl<'a> Sequence<'a> for &'a [u8] {
fn sequence(&'a self) -> &'a [u8] {
self
}
}
impl<'a> Sequence<'a> for [u8] {
fn sequence(&'a self) -> &'a [u8] {
self
}
}
impl<'a> Sequence<'a> for Cow<'a, [u8]> {
fn sequence(&'a self) -> &'a [u8] {
self
}
}
pub trait QualitySequence<'a>: Sequence<'a> {
fn quality(&'a self) -> &'a [u8];
fn quality_mask(&'a self, score: u8) -> Cow<'a, [u8]> {
let qual = self.quality();
let seq: Vec<u8> = self
.sequence()
.iter()
.zip(qual.iter())
.map(|(base, qual)| if *qual < score { b'N' } else { *base })
.collect();
seq.into()
}
}
impl<'a> Sequence<'a> for (&'a [u8], &'a [u8]) {
fn sequence(&'a self) -> &'a [u8] {
self.0
}
}
impl<'a> QualitySequence<'a> for (&'a [u8], &'a [u8]) {
fn quality(&'a self) -> &'a [u8] {
self.1
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_normalize() {
assert_eq!(
normalize(b"ACGTU", false).as_deref(),
Some(b"ACGTT".as_slice())
);
assert_eq!(
normalize(b"acgtu", false).as_deref(),
Some(b"ACGTT".as_slice())
);
assert_eq!(
normalize(b"N.N-N~N N", false).as_deref(),
Some(b"N-N-N-NN".as_slice())
);
assert_eq!(normalize(b"BDHVRYSWKM", true), None);
assert_eq!(
normalize(b"bdhvryswkm", true).as_deref(),
Some(b"BDHVRYSWKM".as_slice())
);
assert_eq!(
normalize(b"BDHVRYSWKM", false).as_deref(),
Some(b"NNNNNNNNNN".as_slice())
);
assert_eq!(
normalize(b"bdhvryswkm", false).as_deref(),
Some(b"NNNNNNNNNN".as_slice())
);
}
#[test]
fn test_complement() {
assert_eq!(complement(b'a'), b't');
assert_eq!(complement(b'c'), b'g');
assert_eq!(complement(b'g'), b'c');
assert_eq!(complement(b'n'), b'n');
}
#[test]
fn can_canonicalize() {
assert_eq!(canonical(b"A").as_ref(), b"A");
assert_eq!(canonical(b"T").as_ref(), b"A");
assert_eq!(canonical(b"AAGT").as_ref(), b"AAGT");
assert_eq!(canonical(b"ACTT").as_ref(), b"AAGT");
assert_eq!(canonical(b"GC").as_ref(), b"GC");
}
#[test]
fn can_minimize() {
let minmer = minimizer(&b"ATTTCG"[..], 3);
assert_eq!(&minmer[..], b"AAA");
}
#[test]
fn test_quality_mask() {
let seq_rec = (&b"AGCT"[..], &b"AAA0"[..]);
let filtered_rec = seq_rec.quality_mask(b'5');
assert_eq!(&filtered_rec[..], &b"AGCN"[..]);
}
}