noel 0.2.1

A GTF/GFF per gene non-overlapping exon length calculator
Documentation
#![allow(dead_code)]
use std::collections::HashMap;
use thiserror::Error;

const SEP: &str = ".";

#[derive(Debug, PartialEq)]
pub struct Attribute {
    gene_id: String,
}

impl Attribute {
    pub fn parse(line: &str) -> Result<Attribute, NoelError> {
        if !line.is_empty() {
            let mut attributes: HashMap<String, String> = HashMap::new();
            let bytes = line.as_bytes().iter().enumerate();

            let mut start = 0;
            for (i, byte) in bytes {
                if *byte == b';' {
                    let word = &line[start..i];
                    if !word.is_empty() && word.starts_with("gene_id") {
                        let (key, value) = get_pair(word).ok_or(NoelError::Parse)?;
                        attributes.insert(key, value);
                    }
                    start = i + 1;
                }
            }

            let gene_id = attributes.get("gene_id").ok_or(NoelError::Invalid);

            Ok(Attribute {
                gene_id: gene_id?.to_string(),
            })
        } else {
            Err(NoelError::Empty)
        }
    }

    pub fn gene_id(&self) -> &str {
        &self.gene_id
    }
}

fn get_pair(line: &str) -> Option<(String, String)> {
    let mut bytes = line.as_bytes().iter();
    let i = if let Some(pos) = bytes.position(|b| *b == b' ' || *b == b'=') {
        pos
    } else {
        line.len()
    };

    let key = &line[..i];
    let value = get_gene(*&line[i + 1..].trim_matches('"'), &SEP)?;

    Some((key.to_string(), value.to_string()))
}

fn get_gene(gene: &str, sep: &str) -> Option<String> {
    let bytes = gene.as_bytes().iter().enumerate();
    let start = 0;
    let mut word = String::new();

    for (i, byte) in bytes {
        if *byte == sep.as_bytes()[0] {
            word.push_str(&gene[start..i]);
        }
    }
    if !word.is_empty() {
        return Some(word);
    } else {
        return Some(gene.to_string());
    }
}

#[derive(Error, Debug, PartialEq)]
pub enum NoelError {
    #[error("Empty line")]
    Empty,
    #[error("Invalid GTF line")]
    Invalid,
    #[error("Not an exon")]
    NoExon,
    #[error("Parsing error")]
    Parse,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn valid_gtf_attributes() {
        let input = "gene_id \"ABC\"; transcript_id \"XYZ\"; exon_number \"1\"; exon_id \"123\";"
            .to_string();
        let attr = Attribute::parse(&input).unwrap();

        assert_eq!(attr.gene_id(), "ABC");
    }

    #[test]
    fn valid_gff_attributes() {
        let input = "ID=exon:ENST00000456328.2.1;Parent=ENST00000456328.2;gene_id=ENSG00000290825.1;transcript_id=ENST00000456328.2,exon_number=1".to_string();
        let attr = Attribute::parse(&input).unwrap();

        assert_eq!(attr.gene_id(), "ENSG00000290825");
    }

    #[test]
    fn invalid_attributes() {
        let input = "transcript_id \"XYZ\"; exon_number \"1\";".to_string();
        let result = Attribute::parse(&input);

        assert_eq!(result.unwrap_err(), NoelError::Invalid);
    }

    #[test]
    fn valid_gene() {
        let gene = "ENSG00000290825.1".to_string();
        let result = get_gene(&gene, &SEP);
        assert_eq!(result.unwrap(), "ENSG00000290825");
    }

    #[test]
    fn invalid_gene() {
        let gene = "ENSG00000290825.1".to_string();
        let sep = ".".to_string();
        let result = get_gene(&gene, &sep);
        assert_ne!(result.unwrap(), "ENSG00000290825.1");
    }

    #[test]
    fn get_gff_pair() {
        let line = "gene_id=ENSG00000290825.1";
        let tup = ("gene_id".to_string(), "ENSG00000290825.1".to_string());

        let pair = get_pair(line).unwrap();
        assert_eq!(tup, pair);
    }
}