atglib 0.2.0

A library to handle transcripts for genomics and transcriptomics
Documentation
//! This module contains some standard transcripts to use for testing
//!
//! This simplifies writing unit- and integration tests.
//! Some transcripts in here have been choosen deliberately, because they
//! contain some edge cases that I tripped across during development.

use crate::models;
use crate::models::{Exon, Transcript, TranscriptBuilder};

/// NM_001365057.2
///
/// The last exon contains the last nucleotide of the stop codon.
/// Thus the exon is considered `non-coding` in GTF specs, but RefGene requires
/// a frame-offset value for the exon regardless.
pub fn nm_001365057() -> Transcript {
    let mut transcript = TranscriptBuilder::new()
        .name("NM_001365057.2")
        .chrom("chr9")
        .gene("C9orf85")
        .strand(models::Strand::Plus)
        .cds_start_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .cds_stop_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .build()
        .unwrap();

    transcript.push_exon(Exon::new(
        74526555,
        74526752,
        Some(74526651),
        Some(74526752),
        models::Frame::Zero,
    ));

    transcript.push_exon(Exon::new(
        74561922,
        74562028,
        Some(74561922),
        Some(74562028),
        models::Frame::Zero,
    ));

    transcript.push_exon(Exon::new(
        74597573,
        74600974,
        Some(74597573),
        Some(74597573),
        models::Frame::One,
    ));

    transcript
}

/// NM_001365408.1
///
/// The start codon is split across two exons.
pub fn nm_001365408() -> Transcript {
    let mut transcript = TranscriptBuilder::new()
        .name("NM_001365408.1")
        .chrom("chr16")
        .gene("CES2")
        .strand(models::Strand::Plus)
        .cds_start_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .cds_stop_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .build()
        .unwrap();

    transcript.push_exon(Exon::new(
        66969419,
        66969778,
        None,
        None,
        models::Frame::None,
    ));
    transcript.push_exon(Exon::new(
        66971940,
        66972144,
        Some(66972143),
        Some(66972144),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        66973120,
        66973261,
        Some(66973120),
        Some(66973261),
        models::Frame::One,
    ));
    transcript.push_exon(Exon::new(
        66974125,
        66974258,
        Some(66974125),
        Some(66974258),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        66974340,
        66974598,
        Some(66974340),
        Some(66974598),
        models::Frame::One,
    ));
    transcript.push_exon(Exon::new(
        66975027,
        66975125,
        Some(66975027),
        Some(66975125),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        66975409,
        66975549,
        Some(66975409),
        Some(66975549),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        66975671,
        66975751,
        Some(66975671),
        Some(66975751),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        66976008,
        66976152,
        Some(66976008),
        Some(66976152),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        66976551,
        66976640,
        Some(66976551),
        Some(66976640),
        models::Frame::Two,
    ));
    transcript.push_exon(Exon::new(
        66977202,
        66977274,
        Some(66977202),
        Some(66977274),
        models::Frame::Two,
    ));
    transcript.push_exon(Exon::new(
        66977742,
        66978999,
        Some(66977742),
        Some(66977928),
        models::Frame::One,
    ));

    transcript
}

/// NM_001371720.1
///
/// This transcript contains two book-ended exons.
/// This is causing issues in GTF parsing.
/// To help fixing the issue, this function requires an extra
/// boolean parameter to indicate if it should be parsed as GTF.
/// This should be removed once GTF parsing is fixed.
pub fn nm_001371720(gtf: bool) -> Transcript {
    let mut transcript = TranscriptBuilder::new()
        .name("NM_001371720.1")
        .chrom("chr1")
        .gene("MUC1")
        .strand(models::Strand::Minus)
        .cds_start_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .cds_stop_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .build()
        .unwrap();

    transcript.push_exon(Exon::new(
        155158300,
        155158685,
        Some(155158611),
        Some(155158685),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        155159701,
        155159850,
        Some(155159701),
        Some(155159850),
        models::Frame::Zero,
    ));
    transcript.push_exon(Exon::new(
        155159931,
        155160052,
        Some(155159931),
        Some(155160052),
        models::Frame::Two,
    ));
    transcript.push_exon(Exon::new(
        155160198,
        155160334,
        Some(155160198),
        Some(155160334),
        models::Frame::One,
    ));
    transcript.push_exon(Exon::new(
        155160484,
        155160539,
        Some(155160484),
        Some(155160539),
        models::Frame::Zero,
    ));

    if gtf {
        transcript.push_exon(Exon::new(
            155160639,
            155162101,
            Some(155160639),
            Some(155162101),
            models::Frame::Two,
        ));
    } else {
        transcript.push_exon(Exon::new(
            155160639,
            155161619,
            Some(155160639),
            Some(155161619),
            models::Frame::Zero,
        ));
        transcript.push_exon(Exon::new(
            155161620,
            155162101,
            Some(155161620),
            Some(155162101),
            models::Frame::Two,
        ));
    }

    transcript.push_exon(Exon::new(
        155162577,
        155162700,
        Some(155162577),
        Some(155162634),
        models::Frame::Zero,
    ));
    transcript
}

/// NM_201550.4
///
/// Single exon transcript
pub fn nm_201550() -> Transcript {
    let mut transcript = TranscriptBuilder::new()
        .name("NM_201550.4")
        .chrom("chr12")
        .gene("LRRC10")
        .strand(models::Strand::Minus)
        .cds_start_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .cds_stop_codon_stat(models::CdsStat::Complete)
        .unwrap()
        .build()
        .unwrap();

    transcript.push_exon(Exon::new(
        70002344,
        70004687,
        Some(70003785),
        Some(70004618),
        models::Frame::Zero,
    ));

    transcript
}

/// Generates a transcript to be used for tests.
/// It contains 5 exons, 3 of which are coding
///
/// ```text
/// Nucleotide positions:
///
///    1....     2....     3....     4....     5....
///    12345     12345     12345     12345     12345
/// ---=====-----===XX-----XXXXX-----XXXX=-----=====---
///    CACGGGGAAATGGATGGACTGCCCAGTAGCCTGAGGACACAGGGG
///
/// ---  Intron
/// ===  Exon (non-coding)
/// XXX  CDS
/// ATGC DNA sequence if small.fasta is used as reference-genome
/// ```
///
pub fn standard_transcript() -> models::Transcript {
    let mut transcript = models::TranscriptBuilder::new()
        .name("Test-Transcript")
        .chrom("chr1")
        .strand(models::Strand::Plus)
        .gene("Test-Gene")
        .cds_start_stat(models::CdsStat::None)
        .cds_end_stat(models::CdsStat::None)
        .build()
        .unwrap();

    transcript.append_exons(&mut exons());
    transcript
}

fn exons() -> Vec<models::Exon> {
    vec![
        models::Exon::new(11, 15, None, None, models::Frame::None),
        models::Exon::new(21, 25, Some(24), Some(25), models::Frame::Zero),
        models::Exon::new(31, 35, Some(31), Some(35), models::Frame::One),
        models::Exon::new(41, 45, Some(41), Some(44), models::Frame::Two),
        models::Exon::new(51, 55, None, None, models::Frame::None),
    ]
}

/// This is the sequence of chrM in the small.fasta file
/// 1                              32 35     42                                                     97 100   106  111 115
/// AATCTTGGCCTGGGCCAAGGAGACCTTCTCTCCAATGGCCTGCACCTGGCTTGCCAGGGCCGATCTTGGTGCCATCCAGGGGGCCTCTACAAGGATAATCTGACCTGCAGGGTCGAG...
/// -------------------------------XXXATGGCCTG------------------------------------------------------AATC-----TGCAGGXXXX
pub fn mito_transcript() -> models::Transcript {
    let mut transcript = models::TranscriptBuilder::new()
        .name("Test-Mito-Transcript")
        .chrom("chrM")
        .strand(models::Strand::Plus)
        .gene("Test-Mito-Gene")
        .cds_start_stat(models::CdsStat::None)
        .cds_end_stat(models::CdsStat::None)
        .build()
        .unwrap();

    let mut exons = vec![
        models::Exon::new(32, 42, Some(35), Some(42), models::Frame::Zero),
        models::Exon::new(97, 100, Some(97), Some(100), models::Frame::Two),
        models::Exon::new(106, 115, Some(106), Some(111), models::Frame::Zero),
    ];

    transcript.append_exons(&mut exons);
    transcript
}

#[cfg(test)]
mod tests {
    use super::*;

    use crate::fasta::FastaReader;
    use crate::models::{Sequence, Strand};

    #[test]
    fn test_mito_transcript() {
        let tx = mito_transcript();
        let mut fasta_reader = FastaReader::from_file("tests/data/small.fasta").unwrap();

        let seq =
            Sequence::from_coordinates(&tx.cds_coordinates(), &Strand::Plus, &mut fasta_reader)
                .unwrap();
        assert_eq!(seq.to_string(), "ATGGCCTGAATCTGCAGG".to_string());

        let seq =
            Sequence::from_coordinates(&tx.exon_coordinates(), &Strand::Plus, &mut fasta_reader)
                .unwrap();
        assert_eq!(seq.to_string(), "CCAATGGCCTGAATCTGCAGGGTCG".to_string());

        let coords = vec![(tx.chrom(), tx.tx_start(), tx.tx_end())];
        let seq = Sequence::from_coordinates(&coords, &Strand::Plus, &mut fasta_reader).unwrap();
        assert_eq!(
            seq.to_string(),
            "CCAATGGCCTGCACCTGGCTTGCCAGGGCCGATCTTGGTGCCATCCAGGGGGCCTCTACAAGGATAATCTGACCTGCAGGGTCG"
                .to_string()
        );
    }
}