gtfsort 0.2.4

An optimized chr/pos/feature GTF/GFF sorter, written in Rust, that uses a lexicographically based index-ordering algorithm.
Documentation
mod attr;
use std::borrow::Cow;

pub use attr::*;

/// A borrowed view of one GTF/GFF line, holding the fields extracted by
/// [`Record::parse`].
///
/// All string fields are `&'a str` slices, which keeps the type `Copy`.
///
/// The derived `Ord`/`PartialOrd` compare fields in declaration order, so
/// `line_no` is the most significant key. Reordering the fields would change
/// that ordering.
#[derive(Debug, PartialEq, Eq, Ord, PartialOrd, Clone, Copy)]
pub struct Record<'a> {
    /// Line number handed to [`Record::parse`]; stored as given.
    pub line_no: usize,
    /// First tab-separated column of the line.
    pub chrom: &'a str,
    /// Third tab-separated column of the line (e.g. `"gene"`, `"exon"`, `"CDS"`).
    pub feat: &'a str,
    /// Fourth tab-separated column, parsed as `u32`.
    pub start: u32,
    /// Fifth tab-separated column, parsed as `u32`.
    pub end: u32,
    /// Value returned by `gene_id()` on the parsed attribute column.
    pub gene_id: &'a str,
    /// Value returned by `transcript_id()` on the parsed attribute column.
    /// `has_transcript` treats the string `"0"` as "no transcript".
    pub transcript_id: &'a str,
    /// Value returned by `exon_number()` on the parsed attribute column; kept as
    /// text, not converted to a number.
    pub exon_number: &'a str,
    /// The complete input line this record was parsed from.
    pub line: &'a str,
}

impl<'a> Record<'a> {
    /// Builds a record that borrows from one tab-separated GTF/GFF `line`.
    ///
    /// `line_no` is stored unchanged. `SEP` is forwarded to `Attribute::parse`,
    /// which interprets the ninth column.
    ///
    /// # Errors
    ///
    /// Checked in this order:
    /// - `"Empty line"` when `line` is empty;
    /// - `"Missing <column>"` for the first of the nine columns that is absent;
    /// - the stringified error of `Attribute::parse` for the ninth column;
    /// - `"Invalid start"` / `"Invalid end"` when a coordinate is not a `u32`.
    #[inline]
    pub fn parse<const SEP: u8>(line_no: usize, line: &'a str) -> Result<Self, Cow<'static, str>> {
        if line.is_empty() {
            return Err(Cow::Borrowed("Empty line"));
        }

        // Walk the columns one at a time; columns we do not keep are still
        // required to exist, so each one is pulled and checked.
        let mut columns = line.split('\t');
        let chrom = columns.next().ok_or("Missing chrom")?;
        columns.next().ok_or("Missing source")?;
        let feat = columns.next().ok_or("Missing feature")?;
        let start_raw = columns.next().ok_or("Missing start")?;
        let end_raw = columns.next().ok_or("Missing end")?;
        columns.next().ok_or("Missing score")?;
        columns.next().ok_or("Missing strand")?;
        columns.next().ok_or("Missing frame")?;
        let attrs_raw = columns.next().ok_or("Missing attributes")?;

        // Attributes are parsed before the coordinates, so an attribute error
        // takes precedence over an invalid start/end.
        let attrs = Attribute::parse::<SEP>(attrs_raw).map_err(|err| err.to_string())?;

        let start = start_raw.parse::<u32>().map_err(|_| "Invalid start")?;
        let end = end_raw.parse::<u32>().map_err(|_| "Invalid end")?;

        Ok(Self {
            line_no,
            chrom,
            feat,
            start,
            end,
            gene_id: attrs.gene_id(),
            transcript_id: attrs.transcript_id(),
            exon_number: attrs.exon_number(),
            line,
        })
    }

    /// Sorting key used for gene-level rows: `(start, gene_id, line)`.
    #[inline(always)]
    pub fn outer_layer(&self) -> (u32, &'a str, &'a str) {
        let Self {
            start,
            gene_id,
            line,
            ..
        } = *self;
        (start, gene_id, line)
    }

    /// Legacy ordering key for features inside a transcript:
    /// `(exon_number, feature rank)`.
    ///
    /// The rank is `'a'` for `exon`, `'b'` for `CDS`, `'c'` for `start_codon`,
    /// `'d'` for `stop_codon` and `'e'` for anything else. The exon number is
    /// returned as text, so comparing keys orders it lexicographically
    /// (`"10"` sorts before `"2"`).
    #[inline(always)]
    pub fn inner_layer(&self) -> (&'a str, char) {
        let feature_rank = match self.feat {
            "exon" => 'a',
            "CDS" => 'b',
            "start_codon" => 'c',
            "stop_codon" => 'd',
            _ => 'e',
        };
        (self.exon_number, feature_rank)
    }

    /// Whether the feature column is exactly `"gene"`.
    #[inline(always)]
    pub fn is_gene(&self) -> bool {
        matches!(self.feat, "gene")
    }

    /// Whether the feature column is exactly `"transcript"`.
    #[inline(always)]
    pub fn is_transcript(&self) -> bool {
        matches!(self.feat, "transcript")
    }

    /// Whether the record is attached to a transcript, i.e. its
    /// `transcript_id` is anything other than `"0"`.
    // NOTE(review): "0" is presumably the placeholder the attribute parser
    // yields when no transcript_id is present; confirm in the `attr` module.
    #[inline(always)]
    pub fn has_transcript(&self) -> bool {
        !matches!(self.transcript_id, "0")
    }
}

// Unit tests for `Record`. The module is gated on `cfg(test)` so it is only
// compiled by `cargo test`; without the gate, `use super::*` is an unused
// import in regular builds and had to be silenced with an `allow`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Ensembl-style CDS line shared by the tests below.
    const CDS_LINE: &str = "1\thavana\tCDS\t2408530\t2408619\t.\t-\t0\tgene_id \"ENSG00000157911\"; gene_version \"11\"; transcript_id \"ENST00000508384\"; transcript_version \"5\"; exon_number \"3\"; gene_name \"PEX10\"; gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\"; transcript_name \"PEX10-205\"; transcript_source \"havana\"; transcript_biotype \"protein_coding\"; protein_id \"ENSP00000464289\"; protein_version \"1\"; tag \"cds_end_NF\"; tag \"mRNA_end_NF\"; transcript_support_level \"3\";";

    #[test]
    fn valid_record() {
        let result = Record::parse::<b' '>(0, CDS_LINE);

        assert!(result.is_ok());

        let record = result.unwrap();
        assert_eq!(record.line_no, 0);
        assert_eq!(record.chrom, "1");
        assert_eq!(record.feat, "CDS");
        assert_eq!(record.start, 2408530);
        assert_eq!(record.end, 2408619);
        assert_eq!(record.gene_id, "ENSG00000157911");
        assert_eq!(record.transcript_id, "ENST00000508384");
        assert_eq!(record.exon_number, "3");
        assert_eq!(record.line, CDS_LINE);
    }

    #[test]
    fn empty_record() {
        let result = Record::parse::<b' '>(0, "");

        assert!(result.is_err());
        assert_eq!(result.unwrap_err(), "Empty line");
    }

    #[test]
    fn missing_columns() {
        // Only three columns: the first absent one is `start`.
        let result = Record::parse::<b' '>(0, "1\thavana\tCDS");

        assert_eq!(result.unwrap_err(), "Missing start");
    }

    #[test]
    fn invalid_start() {
        let line = CDS_LINE.replace("\t2408530\t", "\tabc\t");
        let result = Record::parse::<b' '>(0, &line);

        assert_eq!(result.unwrap_err(), "Invalid start");
    }

    #[test]
    fn outer_layer() {
        let record = Record::parse::<b' '>(0, CDS_LINE).unwrap();
        let (start, gene_id, line) = record.outer_layer();

        assert_eq!(start, 2408530);
        assert_eq!(gene_id, "ENSG00000157911");
        assert_eq!(line, CDS_LINE);
    }

    #[test]
    fn inner_layer_and_predicates() {
        let record = Record::parse::<b' '>(0, CDS_LINE).unwrap();

        // CDS rows rank as 'b' within their exon.
        assert_eq!(record.inner_layer(), ("3", 'b'));
        assert!(!record.is_gene());
        assert!(!record.is_transcript());
        assert!(record.has_transcript());
    }
}