//! Generic functions for working with (primarily nucleic acid) sequences
use std::borrow::Cow;
use memchr::memchr2;
use crate::bitkmer::BitNuclKmer;
use crate::kmer::{CanonicalKmers, Kmers};
/// Transform a nucleic acid sequence into its "normalized" form.
///
/// The normalized form is:
/// - only AGCTN and possibly - (for gaps)
/// - strip out any whitespace or line endings
/// - lowercase versions of these are uppercased
/// - U is converted to T (make everything a DNA sequence)
/// - some other punctuation is converted to gaps
/// - IUPAC bases may be converted to N's depending on the parameter passed in
/// - everything else is considered a N
pub fn normalize(seq: &[u8], allow_iupac: bool) -> Option<Vec<u8>> {
let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
let mut changed: bool = false;
for n in seq.iter() {
let (new_char, char_changed) = match (*n, allow_iupac) {
c @ (b'A', _)
| c @ (b'C', _)
| c @ (b'G', _)
| c @ (b'T', _)
| c @ (b'N', _)
| c @ (b'-', _) => (c.0, false),
(b'a', _) => (b'A', true),
(b'c', _) => (b'C', true),
(b'g', _) => (b'G', true),
// normalize uridine to thymine
(b't', _) | (b'u', _) | (b'U', _) => (b'T', true),
// normalize gaps
(b'.', _) | (b'~', _) => (b'-', true),
// logic for IUPAC bases (a little messy)
c @ (b'B', true)
| c @ (b'D', true)
| c @ (b'H', true)
| c @ (b'V', true)
| c @ (b'R', true)
| c @ (b'Y', true)
| c @ (b'S', true)
| c @ (b'W', true)
| c @ (b'K', true)
| c @ (b'M', true) => (c.0, false),
(b'b', true) => (b'B', true),
(b'd', true) => (b'D', true),
(b'h', true) => (b'H', true),
(b'v', true) => (b'V', true),
(b'r', true) => (b'R', true),
(b'y', true) => (b'Y', true),
(b's', true) => (b'S', true),
(b'w', true) => (b'W', true),
(b'k', true) => (b'K', true),
(b'm', true) => (b'M', true),
// remove all whitespace and line endings
(b' ', _) | (b'\t', _) | (b'\r', _) | (b'\n', _) => (b' ', true),
// everything else is an N
_ => (b'N', true),
};
changed = changed || char_changed;
if new_char != b' ' {
buf.push(new_char);
}
}
if changed {
Some(buf)
} else {
None
}
}
#[test]
fn test_normalize() {
assert_eq!(normalize(b"ACGTU", false), Some(b"ACGTT".to_vec()));
assert_eq!(normalize(b"acgtu", false), Some(b"ACGTT".to_vec()));
assert_eq!(normalize(b"N.N-N~N N", false), Some(b"N-N-N-NN".to_vec()));
assert_eq!(normalize(b"BDHVRYSWKM", true), None);
assert_eq!(normalize(b"bdhvryswkm", true), Some(b"BDHVRYSWKM".to_vec()));
assert_eq!(
normalize(b"BDHVRYSWKM", false),
Some(b"NNNNNNNNNN".to_vec())
);
assert_eq!(
normalize(b"bdhvryswkm", false),
Some(b"NNNNNNNNNN".to_vec())
);
}
/// Returns the complementary base for a given IUPAC base code.
///
/// Does not work for RNA sequences (maybe we should raise an error or something?)
#[inline]
pub fn complement(n: u8) -> u8 {
match n {
b'a' => b't',
b'A' => b'T',
b'c' => b'g',
b'C' => b'G',
b'g' => b'c',
b'G' => b'C',
b't' => b'a',
b'T' => b'A',
// IUPAC codes
b'r' => b'y',
b'y' => b'r',
b'k' => b'm',
b'm' => b'k',
b'b' => b'v',
b'v' => b'b',
b'd' => b'h',
b'h' => b'd',
b's' => b's',
b'w' => b'w',
b'R' => b'Y',
b'Y' => b'R',
b'K' => b'M',
b'M' => b'K',
b'B' => b'V',
b'V' => b'B',
b'D' => b'H',
b'H' => b'D',
b'S' => b'S',
b'W' => b'W',
// anything else just pass through
// 'u' | 'U' => panic!("Does not support complements of U"),
x => x,
}
}
/// Taking in a sequence string, return the canonical form of the sequence
/// (e.g. the lexigraphically lowest of either the original sequence or its
/// reverse complement)
pub fn canonical(seq: &[u8]) -> Cow<[u8]> {
let mut buf: Vec<u8> = Vec::with_capacity(seq.len());
// enough just keeps our comparisons from happening after they need to
let mut enough = false;
let mut original_was_canonical = false;
// loop through the kmer and its reverse complement simultaneously
for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) {
buf.push(rn);
if !enough && (*n < rn) {
original_was_canonical = true;
break;
} else if !enough && (rn < *n) {
enough = true;
}
// unstated if branch: if rn == n, keep comparing
}
match (original_was_canonical, enough) {
(true, true) => panic!("Bug: should never set original_was_canonical if enough == true"),
(true, false) => seq.into(),
(false, true) => buf.into(),
// the sequences were completely equal, return the ref
(false, false) => seq.into(),
}
}
/// Find the lexigraphically smallest substring of `seq` of length `length`
///
/// There's probably a faster algorithm for this somewhere...
pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> {
let reverse_complement: Vec<u8> = seq.iter().rev().map(|n| complement(*n)).collect();
let mut minmer = Cow::Borrowed(&seq[..length]);
for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) {
if *kmer < minmer[..] {
minmer = kmer.into();
}
if *rc_kmer < minmer[..] {
minmer = rc_kmer.to_vec().into();
}
}
minmer
}
/// A generic FASTX record that also abstracts over several logical operations
/// that can be performed on nucleic acid sequences.
pub trait Sequence<'a> {
fn sequence(&'a self) -> &'a [u8];
/// Remove newlines from the sequence; this handles `\r`, `\n`, and `\r\n`
/// and removes internal newlines in addition to ones at the end.
/// Primarily used for FASTA multiline records, but can also help process
/// (the much rarer) multiline FASTQs. Always use before iteration methods
/// below to ensure no newlines are being returned with e.g. `.kmers`.
fn strip_returns(&'a self) -> Cow<'a, [u8]> {
let seq = self.sequence();
// first part is a fast check to see if we need to do any allocations
let mut i;
match memchr2(b'\r', b'\n', &seq) {
Some(break_loc) => i = break_loc,
None => return seq.into(),
}
// we found a newline; create a new buffer and stripping out newlines
// and writing into it
let mut new_buf = Vec::with_capacity(seq.len() - 1);
new_buf.extend_from_slice(&seq[..i]);
while i < seq.len() {
match memchr2(b'\r', b'\n', &seq[i..]) {
None => {
new_buf.extend_from_slice(&seq[i..]);
break;
}
Some(match_pos) => {
new_buf.extend_from_slice(&seq[i..i + match_pos]);
i += match_pos + 1;
}
}
}
new_buf.into()
}
/// Returns the reverse complement of a sequence. Biologically this is
/// equivalent to the sequence of the strand opposite the one you pass
/// in.
///
/// ```
/// use needletail::Sequence;
///
/// assert_eq!(b"AACC".reverse_complement(), b"GGTT");
/// ```
fn reverse_complement(&'a self) -> Vec<u8> {
self.sequence()
.iter()
.rev()
.map(|n| complement(*n))
.collect()
}
/// [Nucleic Acids] Normalizes the sequence. See documentation for
/// `needletail::sequence::normalize`. Do not use on amino acid
/// sequences. Note that this returns a Cow so you may have to coerce
/// to a Vec<u8> or &[u8] as necessary.
///
/// ```
/// use needletail::Sequence;
///
/// // IUPAC bases are coerced to N's if `false`
/// assert_eq!(b"ADGH".normalize(false).as_ref(), b"ANGN");
/// // otherwise they're preserved
/// assert_eq!(b"ADGH".normalize(true).as_ref(), b"ADGH");
///
/// // Uridine residues are converted to thymidine
/// assert_eq!(b"ACGU".normalize(true).as_ref(), b"ACGT");
/// ```
fn normalize(&'a self, iupac: bool) -> Cow<'a, [u8]> {
if let Some(s) = normalize(&self.sequence(), iupac) {
s.into()
} else {
self.sequence().into()
}
}
/// [Nucleic Acids] Returns an iterator over the sequence that skips
/// non-ACGT bases and returns a tuple containing (position, the
/// canonicalized kmer, if the sequence is the complement of the original).
fn canonical_kmers(&'a self, k: u8, reverse_complement: &'a [u8]) -> CanonicalKmers<'a> {
CanonicalKmers::new(self.sequence().as_ref(), reverse_complement, k)
}
/// Returns an iterator that returns a sliding window of k-sized
/// sequences (k-mers). Does not skip whitespace or correct bases in the
/// original sequence so `.normalize` or `.strip_returns` may be
/// appropriate to use first.
fn kmers(&'a self, k: u8) -> Kmers<'a> {
Kmers::new(self.sequence().as_ref(), k)
}
/// Return an iterator that returns valid kmers in 4-bit form
fn bit_kmers(&'a self, k: u8, canonical: bool) -> BitNuclKmer<'a> {
BitNuclKmer::new(self.sequence(), k, canonical)
}
}
impl<'a> Sequence<'a> for &'a [u8] {
fn sequence(&'a self) -> &'a [u8] {
&self
}
}
impl<'a> Sequence<'a> for [u8] {
fn sequence(&'a self) -> &'a [u8] {
&self
}
}
impl<'a> Sequence<'a> for Cow<'a, [u8]> {
fn sequence(&'a self) -> &'a [u8] {
&self
}
}
/// [⚠️Unstable] A trait to wrap over sequence data that has associated
/// quality information.
///
/// Will be stabilized once we figure out a good way to handle sequences that
/// have _optional_ quality information (like SequenceRecord) because the
/// return trait requires a slice from an immutable reference and
/// SequenceRecords can't return that without modifying themselves.
pub trait QualitySequence<'a>: Sequence<'a> {
fn quality(&'a self) -> &'a [u8];
/// Given a SeqRecord and a quality cutoff, mask out low-quality bases with
/// `N` characters.
fn quality_mask(&'a self, score: u8) -> Cow<'a, [u8]> {
let qual = self.quality();
// could maybe speed this up by doing a copy of base and then
// iterating though qual and masking?
let seq: Vec<u8> = self
.sequence()
.iter()
.zip(qual.iter())
.map(|(base, qual)| if *qual < score { b'N' } else { *base })
.collect();
seq.into()
}
}
impl<'a> Sequence<'a> for (&'a [u8], &'a [u8]) {
fn sequence(&'a self) -> &'a [u8] {
&self.0
}
}
impl<'a> QualitySequence<'a> for (&'a [u8], &'a [u8]) {
fn quality(&'a self) -> &'a [u8] {
&self.1
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_complement() {
assert_eq!(complement(b'a'), b't');
assert_eq!(complement(b'c'), b'g');
assert_eq!(complement(b'g'), b'c');
assert_eq!(complement(b'n'), b'n');
}
#[test]
fn can_canonicalize() {
assert!(canonical(b"A") == Cow::Borrowed(b"A"));
assert!(canonical(b"T") == Cow::Owned::<[u8]>(b"A".to_vec()));
assert!(canonical(b"AAGT") == Cow::Borrowed(b"AAGT"));
assert!(canonical(b"ACTT") == Cow::Owned::<[u8]>(b"AAGT".to_vec()));
assert!(canonical(b"GC") == Cow::Borrowed(b"GC"));
}
#[test]
fn can_minimize() {
let minmer = minimizer(&b"ATTTCG"[..], 3);
assert_eq!(&minmer[..], b"AAA");
}
#[test]
fn test_quality_mask() {
let seq_rec = (&b"AGCT"[..], &b"AAA0"[..]);
let filtered_rec = seq_rec.quality_mask(b'5');
assert_eq!(&filtered_rec[..], &b"AGCN"[..]);
}
}