// gtftools 0.1.9
//
// A barebones GTF toolkit with fast nom-based IO.
// Documentation
use super::{GtfReader, GtfRecord};
use anyhow::{bail, Result};
use bstr::ByteSlice;
use std::{
    fs::File,
    io::{BufRead, BufReader, Read},
};

/// An in-memory collection of [`GtfRecord`]s.
pub struct Gtf {
    /// The records owned by this collection, stored in the order they were collected.
    records: Vec<GtfRecord>,
}
impl Gtf {
    /// Opens the file at `path` and loads its records.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened, or if no record could be
    /// collected from it.
    pub fn from_file(path: &str) -> Result<Self> {
        let file = File::open(path)?;
        Self::from_reader(file)
    }

    /// Loads records from any [`Read`] source, wrapping it in a [`BufReader`] first.
    ///
    /// # Errors
    ///
    /// Returns an error if no record could be collected from `reader`.
    pub fn from_reader<R: Read>(reader: R) -> Result<Self> {
        Self::from_bufread(BufReader::new(reader))
    }

    /// Loads records from an already-buffered source via [`GtfReader`].
    ///
    /// # Errors
    ///
    /// Returns an error if no record could be collected from `buffer`.
    pub fn from_bufread<B: BufRead>(buffer: B) -> Result<Self> {
        Self::parse_records(GtfReader::from_bufread(buffer))
    }

    /// Drains `reader`, keeping every item it yields as `Ok`.
    ///
    /// # Errors
    ///
    /// Returns an error if not a single record was collected.
    fn parse_records<B: BufRead>(reader: GtfReader<B>) -> Result<Self> {
        // NOTE(review): items the reader yields as `Err` are discarded without
        // being reported. Whether those are malformed records or non-record
        // lines is not visible from here -- confirm before tightening this.
        let records = reader
            .into_iter()
            .filter_map(Result::ok)
            .collect::<Vec<GtfRecord>>();

        if records.is_empty() {
            bail!("No records found in reader")
        }
        Ok(Self { records })
    }

    /// Returns the number of records held.
    pub fn len(&self) -> usize {
        self.records.len()
    }

    /// Returns `true` if the collection holds no records.
    ///
    /// A collection built by the `from_*` constructors is never empty, but one
    /// produced by [`Gtf::feature_subset`] can be.
    pub fn is_empty(&self) -> bool {
        self.records.is_empty()
    }

    /// Returns a new collection with clones of the records whose `feature`
    /// field contains `feature_name`.
    ///
    /// The match is a substring test (`contains_str`), not an equality test,
    /// so a record is selected whenever `feature_name` occurs anywhere inside
    /// its feature. The result may be empty.
    pub fn feature_subset(&self, feature_name: &str) -> Self {
        let records = self
            .records
            .iter()
            .filter(|record| record.feature.contains_str(feature_name))
            .cloned()
            .collect();
        Self { records }
    }

    /// Shorthand for `feature_subset("gene")`.
    pub fn genes(&self) -> Self {
        self.feature_subset("gene")
    }

    /// Shorthand for `feature_subset("transcript")`.
    pub fn transcripts(&self) -> Self {
        self.feature_subset("transcript")
    }

    /// Shorthand for `feature_subset("exon")`.
    pub fn exons(&self) -> Self {
        self.feature_subset("exon")
    }
}

#[cfg(test)]
mod testing {
    use std::io::Cursor;

    use super::Gtf;

    /// On-disk fixture used by the file-based tests.
    const EXAMPLE_PATH: &str = "examples/data/example.gtf";

    /// Inline fixture: five `#!` header lines followed by seven tab-separated
    /// lines (one gene, one transcript, three exons, one CDS, one start_codon).
    fn example_gtf() -> String {
        String::from(
            r##"#!genome-build GRCh38.p13
                #!genome-version GRCh38
                #!genome-date 2013-12
                #!genome-build-accession GCA_000001405.28
                #!genebuild-last-updated 2022-04
                1	ensembl_havana	gene	1471765	1497848	.	+	.	gene_id "ENSG00000160072"; gene_version "20"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
                1	ensembl_havana	transcript	1471765	1497848	.	+	.	gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic";
                1	ensembl_havana	exon	1471765	1472089	.	+	.	gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003889014"; exon_version "1"; tag "basic";
                1	ensembl_havana	exon	1471765	1472089	.	+	.	gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003889014"; exon_version "1"; tag "basic";
                1	ensembl_havana	exon	1471765	1472089	.	+	.	gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; exon_id "ENSE00003889014"; exon_version "1"; tag "basic";
                1	ensembl_havana	CDS	1471885	1472089	.	+	0	gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; protein_id "ENSP00000500094"; protein_version "1"; tag "basic";
                1	ensembl_havana	start_codon	1471885	1471887	.	+	0	gene_id "ENSG00000160072"; gene_version "20"; transcript_id "ENST00000673477"; transcript_version "1"; exon_number "1"; gene_name "ATAD3B"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "ATAD3B-206"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30"; tag "basic";"##,
        )
    }

    /// Loads the on-disk fixture, panicking if it cannot be read.
    fn gtf_from_example_file() -> Gtf {
        Gtf::from_file(EXAMPLE_PATH).unwrap()
    }

    /// Loads the inline fixture through an in-memory cursor.
    fn gtf_from_example_string() -> Gtf {
        Gtf::from_reader(Cursor::new(example_gtf())).unwrap()
    }

    #[test]
    fn test_gtf_from_file() {
        assert_eq!(gtf_from_example_file().len(), 5);
    }

    #[test]
    fn test_gtf_from_reader() {
        assert_eq!(gtf_from_example_string().len(), 7);
    }

    #[test]
    fn test_subset_genes() {
        let genes = gtf_from_example_file().genes();
        assert_eq!(genes.len(), 1);
    }

    #[test]
    fn test_subset_transcripts() {
        let transcripts = gtf_from_example_file().transcripts();
        assert_eq!(transcripts.len(), 1);
    }

    #[test]
    fn test_subset_exons() {
        let exons = gtf_from_example_string().exons();
        assert_eq!(exons.len(), 3);
    }
}