atglib 0.2.0

A library to handle transcripts for genomics and transcriptomics
Documentation
use std::fmt;

use crate::models::sequence::Nucleotide;
use crate::models::AminoAcid;
use crate::utils::errors::AtgError;

/// The genetic code lookup table is an Array with 64 amino acids
/// The lookup occurs through defined calculations:
/// T => positions 0-15
/// C => posisions 16-31
/// A => positions 32-47
/// G => positions 48-63
/// https://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/SEQFEAT.HTML
type GeneticCodeLookup = [AminoAcid; 64];

/// The genetic code is basically a lookup table from DNA codons to AminoAcids
///
/// # Note
/// The genetic code does not support alternative start codons. *ATGlib* considers only `ATG` as start codon.
///
/// # Examples
/// ```
/// use atglib::models::{AminoAcid, GeneticCode, Nucleotide};
/// let code = GeneticCode::default();
/// assert_eq!(
///     code.translate(&[Nucleotide::A, Nucleotide::T, Nucleotide::G])
///         .unwrap(),
///     AminoAcid::M
/// );
/// ```
#[derive(Debug)]
pub struct GeneticCode {
    code: GeneticCodeLookup,
}

impl Default for GeneticCode {
    /// Crates the [standard genetic code](https://en.wikipedia.org/wiki/DNA_codon_table)
    fn default() -> GeneticCode {
        GeneticCode {
            code: "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
                .chars()
                .map(|c| c.try_into().unwrap()) // cannot fail
                .collect::<Vec<AminoAcid>>()
                .try_into()
                .unwrap(), // cannot fail
        }
    }
}

impl PartialEq for GeneticCode {
    fn eq(&self, other: &Self) -> bool {
        self.code == other.code
    }
}
impl Eq for GeneticCode {}

impl fmt::Display for GeneticCode {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}",
            self.code
                .iter()
                .map(|aa| aa.single_letter())
                .collect::<String>()
        )
    }
}

impl GeneticCode {
    /// Creates a new, custom genetic code
    ///
    /// The aa_table must be the amino acid translation for each possible codon, in the same order
    /// as the [NCBI genetic code tables](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/gc.prt)
    ///
    /// The genetic code translation is provided as an array with 64 elements (one for each codon)
    /// # Examples
    /// ```
    /// use atglib::models::{AminoAcid, GeneticCode, Nucleotide};
    /// // use the genetic code table for `Yeast Mitochondrial`
    /// let code = GeneticCode::new("FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG").unwrap();
    /// assert_eq!(
    ///     code.translate(&[Nucleotide::T, Nucleotide::G, Nucleotide::A])
    ///         .unwrap(),
    ///     AminoAcid::W
    /// );
    /// ```
    pub fn new(aa_table: &str) -> Result<GeneticCode, AtgError> {
        if aa_table.len() != 64 {
            return Err(AtgError::new(
                "aa_table has wrong length. 64 amino acids are required",
            ));
        }

        let allowed_chars = "*ACDEFGHIKLMNPQRSTVWY";

        for letter in aa_table.chars() {
            if !allowed_chars.contains(letter) {
                return Err(AtgError::new(format!("Invalid amino acid {}", letter)));
            }
        }
        Ok(GeneticCode {
            code: aa_table
                .chars()
                .map(|c| c.try_into().unwrap()) // cannot fail
                .collect::<Vec<AminoAcid>>()
                .try_into()
                .unwrap(), // cannot fail
        })
    }

    /// Creates the genetic code of [vertrebrate mitochondrial](https://en.wikipedia.org/wiki/Vertebrate_mitochondrial_code)
    ///
    /// # Examples
    /// ```
    /// use atglib::models::{AminoAcid, GeneticCode, Nucleotide};
    /// let code = GeneticCode::vertebrate_mitochondrial();
    /// assert_eq!(
    ///     code.translate(&[Nucleotide::A, Nucleotide::G, Nucleotide::A])
    ///         .unwrap(),
    ///     AminoAcid::Ter
    /// );
    /// ```
    pub fn vertebrate_mitochondrial() -> GeneticCode {
        GeneticCode::new("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG")
            .unwrap()
    }

    /// Creates the genetic code of [yeast mitochondrial](https://en.wikipedia.org/wiki/Yeast_mitochondrial_code)
    pub fn yeast_mitochondrial() -> GeneticCode {
        GeneticCode::new("FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG")
            .unwrap()
    }

    /// Creates the genetic code of [invertebrate mitochondrial](https://en.wikipedia.org/wiki/Invertebrate_mitochondrial_code)
    pub fn invertebrate_mitochondrial() -> GeneticCode {
        GeneticCode::new("FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG")
            .unwrap()
    }

    /// Tries to create a genetic code from either a known code or from the provided lookup
    ///
    /// This method can be used as a helper function for user-facing applications where users
    /// can provide either the name of the code to use (e.g. `vertebrate_mitochondria`) or provide the
    /// actual lookup table (e.g. `FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG`)
    ///
    /// The names are based on the `name` field from the [NCBI specs](https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/data/gc.prt)
    /// but all lowercase characters. For example `"Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma"`
    /// ```text
    /// ...
    /// {
    /// name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" ,
    /// name "SGC3" ,
    /// id 4 ,
    /// ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    /// sncbieaa "--MM------**-------M------------MMMM---------------M------------"
    /// -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
    /// -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
    /// -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
    /// },
    /// ...
    /// ```
    /// can be called as `GeneticCode::guess("mold mitochondrial; protozoan mitochondrial; coelenterate mitochondrial; mycoplasma; spiroplasma")`
    /// or `GeneticCode::guess("sgc3")`
    ///
    /// # Examples
    /// ```
    /// use atglib::models::GeneticCode;
    ///
    /// assert_eq!(
    ///     GeneticCode::vertebrate_mitochondrial(),
    ///     GeneticCode::guess("vertebrate mitochondrial").unwrap()
    /// );
    ///
    /// assert_eq!(
    ///     GeneticCode::vertebrate_mitochondrial(),
    ///     GeneticCode::guess("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG").unwrap()
    /// );
    /// ```
    pub fn guess(code: &str) -> Result<GeneticCode, AtgError> {
        match code {
            "standard" | "default" | "sgc0" => Ok(GeneticCode::default()),
            "vertebrate mitochondrial" | "sgc1" => Ok(GeneticCode::vertebrate_mitochondrial()),
            "yeast mitochondrial" | "sgc2" => Ok(GeneticCode::yeast_mitochondrial()),
            "mold mitochondrial; protozoan mitochondrial; coelenterate mitochondrial; mycoplasma; spiroplasma" | "sgc3" => GeneticCode::new("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "invertebrate_mitochondrial" | "sgc4" => Ok(GeneticCode::invertebrate_mitochondrial()),
            "ciliate nuclear; dasycladacean nuclear; hexamita nuclear" | "sgc5" => GeneticCode::new("FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "echinoderm mitochondrial; flatworm mitochondrial" | "sgc8" => GeneticCode::new("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"),
            "euplotid nuclear" | "sgc9" => GeneticCode::new("FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "bacterial, archaeal and plant plastid" => GeneticCode::new("FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "alternative yeast nuclear" => GeneticCode::new("FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "ascidian mitochondrial" => GeneticCode::new("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG"),
            "alternative flatworm mitochondrial" => GeneticCode::new("FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"),
            "blepharisma macronuclear" => GeneticCode::new("FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "chlorophycean mitochondrial" => GeneticCode::new("FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "trematode mitochondrial" => GeneticCode::new("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG"),
            "scenedesmus obliquus mitochondrial" => GeneticCode::new("FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "thraustochytrium mitochondrial" => GeneticCode::new("FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "rhabdopleuridae mitochondrial" => GeneticCode::new("FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG"),
            "candidate division sr1 and gracilibacteria" => GeneticCode::new("FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "pachysolen tannophilus nuclear" => GeneticCode::new("FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "karyorelict nuclear" => GeneticCode::new("FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "condylostoma nuclear" => GeneticCode::new("FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "mesodinium nuclear" => GeneticCode::new("FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "peritrich nuclear" => GeneticCode::new("FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "blastocrithidia nuclear" => GeneticCode::new("FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "balanophoraceae plastid" => GeneticCode::new("FFLLSSSSYY*WCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"),
            "cephalodiscidae mitochondrial" => GeneticCode::new("FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG"),
            _ => match GeneticCode::new(code) {
                Ok(code) => Ok(code),
                Err(_) => Err(AtgError::new("Genetic code not known or invalid")),
            },
        }
    }

    /// Translates a provided codon into an AminoAcid
    ///
    /// If the codon contains an `N` nucleotide, the method will return `AtgError`
    ///
    /// # Examples
    /// ```
    /// use atglib::models::{AminoAcid, GeneticCode, Nucleotide};
    /// let code = GeneticCode::default();
    /// let aa = code.translate(&[Nucleotide::A, Nucleotide::T, Nucleotide::G]).unwrap();
    /// assert_eq!(aa, AminoAcid::M);
    /// ```
    pub fn translate(&self, codon: &[Nucleotide; 3]) -> Result<AminoAcid, AtgError> {
        let first_offset = codon[0].as_ncbi_int()? * 16;
        let second_offset = codon[1].as_ncbi_int()? * 4;
        let third_offset = codon[2].as_ncbi_int()?;
        Ok(self.code[first_offset + second_offset + third_offset])
    }

    /// Returns a vector of all codons that code for the provided AminoAcid
    ///
    /// # Examples
    /// ```
    /// use atglib::models::{AminoAcid, GeneticCode, Nucleotide};
    /// let code = GeneticCode::default();
    /// let codons = code.reverse_lookup(&AminoAcid::M);
    /// assert_eq!(
    ///     codons[0],
    ///     [Nucleotide::A, Nucleotide::T, Nucleotide::G]
    /// );
    /// ```
    pub fn reverse_lookup(&self, aa: &AminoAcid) -> Vec<[Nucleotide; 3]> {
        self.code
            .iter()
            .enumerate()
            .filter(|(_, elmt)| *elmt == aa)
            .map(|(idx, _)| {
                // `idx` cannot be larger than 63 (len of `self.code`): `pos1` must be 0 <= 3
                let (pos1, remainder) = (idx / 16, idx % 16);
                // `remainder` cannot be larger than 15: `pos2` must be 0 <= 3
                let (pos2, pos3) = (remainder / 4, remainder % 4);
                [
                    Nucleotide::try_from(pos1 as u8).unwrap(), // cannot fail, due to 0 <= pos <= 3 conditions, described above
                    Nucleotide::try_from(pos2 as u8).unwrap(), // cannot fail, due to 0 <= pos <= 3 conditions, described above
                    Nucleotide::try_from(pos3 as u8).unwrap(), // cannot fail, due to 0 <= pos <= 3 conditions, described above
                ]
            })
            .collect::<Vec<[Nucleotide; 3]>>()
    }

    /// Returns all possible Stop codons
    ///
    /// # Examples
    /// ```
    /// use atglib::models::{GeneticCode, Nucleotide};
    /// let code = GeneticCode::default();
    /// let codons = code.stop_codons();
    /// assert!(codons.contains(&[Nucleotide::T, Nucleotide::A, Nucleotide::A]));
    /// ```
    pub fn stop_codons(&self) -> Vec<[Nucleotide; 3]> {
        self.reverse_lookup(&AminoAcid::Ter)
    }

    /// Returns true if the provided codon is a stop codon
    ///
    /// This method can only handle codons with exaccly 3 nucleotides. All other input will return `false`
    ///
    /// # Examples
    /// ```
    /// use atglib::models::{GeneticCode, Nucleotide};
    /// let code = GeneticCode::default();
    /// assert!(code.is_stop_codon(&[Nucleotide::T, Nucleotide::A, Nucleotide::A]));
    /// ```
    pub fn is_stop_codon(&self, codon: &[Nucleotide]) -> bool {
        if codon.len() != 3 {
            return false;
        }

        matches!(
            self.translate(&[codon[0], codon[1], codon[2]]),
            Ok(AminoAcid::Ter)
        )
    }

    /// Returns `true` if the codon is a start codon
    ///
    /// This method considers only the canonical `ATG` start codon and does
    /// not include non-standard start codons
    ///
    /// # Examples
    /// ```
    /// use atglib::models::{GeneticCode, Nucleotide};
    /// assert!(GeneticCode::is_start_codon(&[Nucleotide::A, Nucleotide::T, Nucleotide::G]));
    /// ```
    pub fn is_start_codon(codon: &[Nucleotide]) -> bool {
        codon == [Nucleotide::A, Nucleotide::T, Nucleotide::G]
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_transcript() {
        let code = GeneticCode::default();
        assert_eq!(
            code.translate(&[Nucleotide::A, Nucleotide::T, Nucleotide::G])
                .unwrap()
                .as_ref(),
            "Met"
        );
        assert_eq!(
            code.translate(&[Nucleotide::T, Nucleotide::T, Nucleotide::T])
                .unwrap()
                .as_ref(),
            "Phe"
        );
        assert_eq!(
            code.translate(&[Nucleotide::T, Nucleotide::A, Nucleotide::A])
                .unwrap()
                .as_ref(),
            "Ter"
        );
        assert_eq!(
            code.translate(&[Nucleotide::T, Nucleotide::A, Nucleotide::G])
                .unwrap()
                .as_ref(),
            "Ter"
        );
    }

    #[test]
    fn test_reverse_lookup() {
        let code = GeneticCode::default();
        for c in "ACDEFGHIKLMNPQRSTVWY*".chars() {
            let aa = AminoAcid::try_from(c).unwrap();
            for nt in code.reverse_lookup(&aa) {
                assert_eq!(aa, code.translate(&nt).unwrap());
            }
        }
    }

    #[test]
    fn test_stop_codon() {
        let code = GeneticCode::default();
        let ter = code.stop_codons();
        assert_eq!(ter.len(), 3);
        assert!(ter.contains(&[Nucleotide::T, Nucleotide::A, Nucleotide::A]));
        assert!(ter.contains(&[Nucleotide::T, Nucleotide::A, Nucleotide::G]));
        assert!(ter.contains(&[Nucleotide::T, Nucleotide::G, Nucleotide::A]));

        assert!(!ter.contains(&[Nucleotide::A, Nucleotide::G, Nucleotide::A]));
        assert!(!ter.contains(&[Nucleotide::A, Nucleotide::G, Nucleotide::G]));
    }

    #[test]
    fn test_mito_stop_codon() {
        let mito_code = GeneticCode::vertebrate_mitochondrial();
        let mito_ter = mito_code.stop_codons();
        assert_eq!(mito_ter.len(), 4);
        assert!(mito_ter.contains(&[Nucleotide::T, Nucleotide::A, Nucleotide::A]));
        assert!(mito_ter.contains(&[Nucleotide::T, Nucleotide::A, Nucleotide::G]));
        assert!(mito_ter.contains(&[Nucleotide::A, Nucleotide::G, Nucleotide::A]));
        assert!(mito_ter.contains(&[Nucleotide::A, Nucleotide::G, Nucleotide::G]));

        assert!(!mito_ter.contains(&[Nucleotide::T, Nucleotide::G, Nucleotide::A]));
    }

    #[test]
    fn test_start_codon() {
        let seq = vec![
            Nucleotide::C,
            Nucleotide::A,
            Nucleotide::T,
            Nucleotide::G,
            Nucleotide::T,
        ];
        assert!(GeneticCode::is_start_codon(&[
            Nucleotide::A,
            Nucleotide::T,
            Nucleotide::G,
        ]));
        assert!(GeneticCode::is_start_codon(&seq[1..4]));
        assert!(!GeneticCode::is_start_codon(&seq[0..3]));
        assert!(!GeneticCode::is_start_codon(&seq[1..5]));
        assert!(!GeneticCode::is_start_codon(&seq));
    }
}