use core::str::FromStr;
use std::convert::TryFrom;
use std::fmt;
use crate::fasta::FastaReader;
use crate::models::transcript::CoordinateVector;
use crate::utils::errors::{AtgError, FastaError};
// ASCII codes of the nucleotide letters. They are written as byte literals
// (instead of hex numbers) so that every value can be checked against its
// name at a glance.
const UPPERCASE_A: u8 = b'A';
const UPPERCASE_C: u8 = b'C';
const UPPERCASE_G: u8 = b'G';
const UPPERCASE_T: u8 = b'T';
const UPPERCASE_N: u8 = b'N';
const LOWERCASE_A: u8 = b'a';
const LOWERCASE_C: u8 = b'c';
const LOWERCASE_G: u8 = b'g';
const LOWERCASE_T: u8 = b't';
// Lowercase 'n' is 0x6e; the value 0x64 would be the letter 'd'.
const LOWERCASE_N: u8 = b'n';

// Small integer codes for the four bases, in the order T, C, A, G.
// NOTE(review): presumably the base ordering used by NCBI genetic code
// tables - confirm against the code that consumes `as_ncbi_int`.
const NCBI_T: u8 = 0u8;
const NCBI_C: u8 = 1u8;
const NCBI_A: u8 = 2u8;
const NCBI_G: u8 = 3u8;

// Line terminators (line feed and carriage return).
const LF: u8 = b'\n';
const CR: u8 = b'\r';
/// A single DNA nucleotide.
///
/// Besides the four bases `A`, `C`, `G` and `T` there is the placeholder `N`
/// (presumably the IUPAC code for "any nucleotide"). `N` complements to itself
/// and cannot be converted into an NCBI integer code.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Nucleotide {
/// Adenine
A,
/// Cytosine
C,
/// Guanine
G,
/// Thymine
T,
/// Placeholder for an unspecified base
N,
}
impl Nucleotide {
    /// Returns the complementary nucleotide.
    ///
    /// `A` and `T` are swapped, `C` and `G` are swapped, `N` stays `N`.
    pub fn complement(&self) -> Self {
        match *self {
            Self::A => Self::T,
            Self::T => Self::A,
            Self::C => Self::G,
            Self::G => Self::C,
            Self::N => Self::N,
        }
    }

    /// Returns the uppercase ASCII byte of the nucleotide letter.
    pub fn to_bytes(self) -> u8 {
        let byte = match self {
            Self::A => UPPERCASE_A,
            Self::C => UPPERCASE_C,
            Self::G => UPPERCASE_G,
            Self::T => UPPERCASE_T,
            Self::N => UPPERCASE_N,
        };
        byte
    }

    /// Returns the nucleotide as a one-letter, uppercase string slice.
    pub fn to_str(self) -> &'static str {
        let letter = match self {
            Self::A => "A",
            Self::C => "C",
            Self::G => "G",
            Self::T => "T",
            Self::N => "N",
        };
        letter
    }

    /// Returns the integer code of the nucleotide (`T` = 0, `C` = 1, `A` = 2,
    /// `G` = 3).
    ///
    /// # Errors
    ///
    /// Returns an [`AtgError`] for `N`, which has no integer code.
    pub fn as_ncbi_int(&self) -> Result<usize, AtgError> {
        let code = match self {
            Self::T => NCBI_T,
            Self::C => NCBI_C,
            Self::A => NCBI_A,
            Self::G => NCBI_G,
            Self::N => {
                return Err(AtgError::new(
                    "N nucleotides cannot be converted to `int`",
                ))
            }
        };
        Ok(usize::from(code))
    }
}
/// Parses a one-letter string (upper- or lowercase) into a [`Nucleotide`].
///
/// Any other string, including the empty string and strings longer than one
/// letter, yields an `Invalid nucleotide` error.
impl FromStr for Nucleotide {
    type Err = AtgError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let nucleotide = match s {
            "A" | "a" => Self::A,
            "C" | "c" => Self::C,
            "G" | "g" => Self::G,
            "T" | "t" => Self::T,
            "N" | "n" => Self::N,
            _ => return Err(AtgError::new("Invalid nucleotide")),
        };
        Ok(nucleotide)
    }
}
/// Formats the nucleotide as its uppercase one-letter code.
impl fmt::Display for Nucleotide {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let letter = match *self {
            Self::A => "A",
            Self::C => "C",
            Self::G => "G",
            Self::T => "T",
            Self::N => "N",
        };
        // Written verbatim; width/padding flags of the caller are not applied.
        f.write_str(letter)
    }
}
/// Converts a character (upper- or lowercase) into a [`Nucleotide`].
///
/// # Errors
///
/// Returns a `newline` error for `'\n'` and `'\r'`.
///
/// # Panics
///
/// Panics for every other character that is not one of `A`, `C`, `G`, `T`,
/// `N` in either case.
impl TryFrom<&char> for Nucleotide {
    type Error = AtgError;

    fn try_from(c: &char) -> Result<Self, Self::Error> {
        // Only ASCII letters are folded, all other characters stay unchanged.
        match c.to_ascii_uppercase() {
            'A' => Ok(Self::A),
            'C' => Ok(Self::C),
            'G' => Ok(Self::G),
            'T' => Ok(Self::T),
            'N' => Ok(Self::N),
            '\n' | '\r' => Err(AtgError::new("newline")),
            _ => panic!("invalid nucleotide {}", c),
        }
    }
}
/// Converts a byte into a [`Nucleotide`].
///
/// A byte is accepted when it equals one of the `UPPERCASE_*` / `LOWERCASE_*`
/// letter constants or, for the four bases, the matching `NCBI_*` code.
///
/// # Errors
///
/// Returns a `newline` error for the `LF` and `CR` bytes.
///
/// # Panics
///
/// Panics for every other byte value.
impl TryFrom<&u8> for Nucleotide {
    type Error = AtgError;

    fn try_from(b: &u8) -> Result<Nucleotide, AtgError> {
        let nucleotide = match *b {
            LOWERCASE_A | UPPERCASE_A | NCBI_A => Self::A,
            LOWERCASE_C | UPPERCASE_C | NCBI_C => Self::C,
            LOWERCASE_G | UPPERCASE_G | NCBI_G => Self::G,
            LOWERCASE_T | UPPERCASE_T | NCBI_T => Self::T,
            LOWERCASE_N | UPPERCASE_N => Self::N,
            LF | CR => return Err(AtgError::new("newline")),
            _ => panic!("invalid nucleotide {}", b),
        };
        Ok(nucleotide)
    }
}
impl TryFrom<u8> for Nucleotide {
type Error = AtgError;
fn try_from(b: u8) -> Result<Nucleotide, AtgError> {
Nucleotide::try_from(&b)
}
}
/// Converts a nucleotide into its uppercase letter.
impl From<&Nucleotide> for char {
    fn from(n: &Nucleotide) -> Self {
        // `Nucleotide` is `Copy`, so the match can work on the value itself.
        match *n {
            Nucleotide::A => 'A',
            Nucleotide::C => 'C',
            Nucleotide::G => 'G',
            Nucleotide::T => 'T',
            Nucleotide::N => 'N',
        }
    }
}
impl From<&Nucleotide> for u8 {
fn from(n: &Nucleotide) -> u8 {
n.to_bytes()
}
}
/// An owned, growable sequence of [`Nucleotide`]s.
pub struct Sequence {
// Nucleotides in sequence order; only reachable through the methods and
// trait impls of `Sequence`.
sequence: Vec<Nucleotide>,
}
/// Parses a string into a [`Sequence`], one nucleotide per character.
///
/// Characters are converted with `Nucleotide::try_from(&char)`. The first
/// conversion error aborts parsing and is returned; a panic of that
/// conversion is not caught.
impl FromStr for Sequence {
    type Err = AtgError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Collecting into `Result` stops at the first failing character.
        s.chars()
            .map(|c| Nucleotide::try_from(&c))
            .collect::<Result<Vec<Nucleotide>, AtgError>>()
            .map(|sequence| Self { sequence })
    }
}
/// Formats the sequence as a string of uppercase nucleotide letters.
impl fmt::Display for Sequence {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Build the whole text first so the formatter receives one write.
        let rendered: String = self.sequence.iter().map(char::from).collect();
        f.write_str(&rendered)
    }
}
impl Default for Sequence {
fn default() -> Self {
Self::new()
}
}
impl Sequence {
    /// Creates an empty `Sequence`.
    pub fn new() -> Self {
        Sequence {
            sequence: Vec::new(),
        }
    }

    /// Creates an empty `Sequence` with pre-allocated room for `capacity`
    /// nucleotides.
    pub fn with_capacity(capacity: usize) -> Self {
        Sequence {
            sequence: Vec::with_capacity(capacity),
        }
    }

    /// Builds a `Sequence` from raw bytes.
    ///
    /// Every byte that `Nucleotide::try_from` converts successfully is
    /// appended. Bytes for which the conversion returns an error (line
    /// breaks) are skipped. `len` is only used as initial capacity.
    ///
    /// # Panics
    ///
    /// Panics if the byte conversion panics, i.e. for bytes that are neither
    /// a nucleotide nor a line break.
    pub fn from_raw_bytes(bytes: &[u8], len: usize) -> Result<Self, AtgError> {
        let mut seq = Self::with_capacity(len);
        for b in bytes {
            if let Ok(n) = Nucleotide::try_from(b) {
                seq.push(n)?
            }
        }
        Ok(seq)
    }

    /// Reads and concatenates the sequences of all `coordinates` segments.
    ///
    /// For every segment, `fasta_reader.read_sequence` is called with the
    /// three segment fields and the result is appended. If `strand` is
    /// `Strand::Minus` the concatenated sequence is reverse-complemented.
    ///
    /// NOTE(review): the fields are presumably (chromosome, start, end) with
    /// an inclusive end, as suggested by the `end - start + 1` capacity
    /// calculation - confirm against `CoordinateVector`.
    ///
    /// # Errors
    ///
    /// Returns the `FastaError` of the first failing `read_sequence` call.
    pub fn from_coordinates<R>(
        coordinates: &CoordinateVector,
        strand: &crate::models::Strand,
        fasta_reader: &mut FastaReader<R>,
    ) -> Result<Sequence, FastaError>
    where
        R: std::io::Seek + std::io::Read,
    {
        // Sum of all segment lengths, used to allocate only once.
        let capacity: u32 = coordinates.iter().map(|x| x.2 - x.1 + 1).sum();
        let mut seq = Sequence::with_capacity(capacity as usize);

        for segment in coordinates {
            seq.append(fasta_reader.read_sequence(segment.0, segment.1.into(), segment.2.into())?)
        }

        if strand == &crate::models::Strand::Minus {
            seq.reverse_complement()
        }
        Ok(seq)
    }

    /// Returns the number of nucleotides in the sequence.
    pub fn len(&self) -> usize {
        self.sequence.len()
    }

    /// Returns `true` if the sequence contains no nucleotides.
    pub fn is_empty(&self) -> bool {
        self.sequence.is_empty()
    }

    /// Converts `c` into a nucleotide and appends it.
    ///
    /// # Errors
    ///
    /// Returns the error of `Nucleotide::try_from(&char)`; nothing is
    /// appended in that case.
    pub fn push_char(&mut self, c: &char) -> Result<(), AtgError> {
        self.sequence.push(Nucleotide::try_from(c)?);
        Ok(())
    }

    /// Appends a nucleotide. This currently always returns `Ok(())`.
    pub fn push(&mut self, n: Nucleotide) -> Result<(), AtgError> {
        self.sequence.push(n);
        Ok(())
    }

    /// Removes all nucleotides from the sequence.
    pub fn clear(&mut self) {
        self.sequence.clear()
    }

    /// Appends all nucleotides of `other`, consuming it.
    pub fn append(&mut self, other: Sequence) {
        self.sequence.append(&mut other.into_inner())
    }

    /// Consumes the sequence and returns the underlying vector.
    fn into_inner(self) -> Vec<Nucleotide> {
        self.sequence
    }

    /// Replaces every nucleotide with its complement, in place.
    pub fn complement(&mut self) {
        for n in &mut self.sequence {
            *n = n.complement();
        }
    }

    /// Reverses the order of the nucleotides, in place.
    pub fn reverse(&mut self) {
        self.sequence.reverse()
    }

    /// Turns the sequence into its reverse complement, in place.
    pub fn reverse_complement(&mut self) {
        self.reverse();
        self.complement();
    }

    /// Returns the sequence as uppercase ASCII bytes.
    pub fn to_bytes(&self) -> Vec<u8> {
        self.sequence.iter().map(|n| n.to_bytes()).collect()
    }

    /// Appends the nucleotide letters of the sequence to `target`.
    pub fn write_into_string(&self, target: &mut String) {
        for c in &self.sequence {
            target.push(c.into())
        }
    }

    /// Returns an iterator over consecutive, non-overlapping chunks of
    /// `chunk_size` nucleotides. The last chunk is shorter if the sequence
    /// length is not a multiple of `chunk_size`.
    ///
    /// # Panics
    ///
    /// Panics if `chunk_size` is 0 (see `slice::chunks`).
    pub fn chunks(&self, chunk_size: usize) -> std::slice::Chunks<'_, Nucleotide> {
        self.sequence.chunks(chunk_size)
    }

    /// Returns the index of the first occurrence of `other` within the
    /// sequence, or `None` if it does not occur. A query that is longer than
    /// the sequence never occurs and yields `None`.
    ///
    /// # Panics
    ///
    /// Panics if `other` is empty.
    pub fn position<T>(&self, other: T) -> Option<usize>
    where
        T: AsRef<[Nucleotide]>,
    {
        let query = other.as_ref();
        assert!(
            !query.is_empty(),
            "empty sequence was passed to Sequence::position"
        );
        // `windows` yields nothing when the query is longer than the
        // sequence, so no start-index arithmetic (which could underflow) is
        // needed.
        self.sequence
            .windows(query.len())
            .position(|window| window == query)
    }

    /// Returns `true` if `other` occurs anywhere within the sequence.
    ///
    /// # Panics
    ///
    /// Panics if `other` is empty.
    pub fn contains<T>(&self, other: T) -> bool
    where
        T: AsRef<[Nucleotide]>,
    {
        self.position(other).is_some()
    }

    /// Returns `true` if `other` has the same length and the same
    /// nucleotides in the same order.
    pub fn equals<T>(&self, other: T) -> bool
    where
        T: AsRef<[Nucleotide]>,
    {
        self.sequence.as_slice() == other.as_ref()
    }
}
/// Gives read-only access to the nucleotides as a slice.
impl AsRef<[Nucleotide]> for Sequence {
    fn as_ref(&self) -> &[Nucleotide] {
        self.sequence.as_slice()
    }
}
impl From<Sequence> for Vec<u8> {
fn from(s: Sequence) -> Vec<u8> {
s.to_bytes()
}
}
/// Indexes the sequence like a slice: a `usize` yields one nucleotide, a
/// range yields a sub-slice.
///
/// # Panics
///
/// Panics if the index is out of bounds, like slice indexing does.
impl<Idx> std::ops::Index<Idx> for Sequence
where
    Idx: std::slice::SliceIndex<[Nucleotide]>,
{
    type Output = Idx::Output;

    fn index(&self, idx: Idx) -> &Self::Output {
        let nucleotides: &[Nucleotide] = self.sequence.as_slice();
        &nucleotides[idx]
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// 44 nucleotides, shared by the parsing and chunking tests.
    const LONG_SEQUENCE: &str = "ATCGACGATCGATCGATGAGCGATCGACGATCGCGCTATCGCTA";

    /// Parsing a string and printing it again yields the same text.
    #[test]
    fn test_create_sequence() {
        let seq = Sequence::from_str(LONG_SEQUENCE).unwrap();
        assert_eq!(seq.len(), 44);
        assert_eq!(seq.to_string(), LONG_SEQUENCE);
    }

    /// `chunks(3)` yields the sequence in consecutive triplets.
    #[test]
    fn test_chunks() {
        let seq = Sequence::from_str(LONG_SEQUENCE).unwrap();
        let mut triplets = seq.chunks(3);
        assert_eq!(
            triplets.next(),
            Some(&[Nucleotide::A, Nucleotide::T, Nucleotide::C][..])
        );
        assert_eq!(
            triplets.next(),
            Some(&[Nucleotide::G, Nucleotide::A, Nucleotide::C][..])
        );
    }

    /// `contains` finds sub-sequences of length one to three.
    #[test]
    fn test_contains() {
        let seq = Sequence::from_str("ATGCGA").unwrap();

        // single nucleotides
        assert!(seq.contains([Nucleotide::A]));
        assert!(seq.contains([Nucleotide::C]));
        assert!(seq.contains([Nucleotide::G]));
        assert!(seq.contains([Nucleotide::T]));
        assert!(!seq.contains([Nucleotide::N]));

        // pairs
        assert!(seq.contains([Nucleotide::A, Nucleotide::T]));
        assert!(seq.contains([Nucleotide::T, Nucleotide::G]));
        assert!(!seq.contains([Nucleotide::A, Nucleotide::C]));
        assert!(!seq.contains([Nucleotide::G, Nucleotide::T]));
        assert!(seq.contains([Nucleotide::G, Nucleotide::A]));

        // triplets
        assert!(seq.contains([Nucleotide::C, Nucleotide::G, Nucleotide::A]));
        assert!(!seq.contains([Nucleotide::G, Nucleotide::A, Nucleotide::C]));
    }

    /// An empty query is rejected with a panic.
    #[test]
    #[should_panic]
    fn test_contains_fails() {
        let seq = Sequence::from_str("ATGCGA").unwrap();
        assert!(seq.contains(Vec::<Nucleotide>::new()));
    }
}