gen 0.1.0

A sequence graph and version control system.
Documentation
use crate::normalize_string;
use crate::operation_management::OperationError;
use gb_io::seq::{Location, Seq};
use regex::{Error as RegexError, Regex};
use std::fmt;
use std::str::{self, FromStr};
use thiserror::Error;

#[derive(Debug, Error, PartialEq)]
pub enum GenBankError {
    #[error("Feature Location Error: {0}")]
    LocationError(&'static str),
    #[error("Parse Error: {0}")]
    ParseError(String),
    #[error("Lookup Error: {0}")]
    LookupError(String),
    #[error("Operation Error: {0}")]
    OperationError(#[from] OperationError),
    #[error("Regex Error: {0}")]
    Regex(#[from] RegexError),
}

#[derive(Copy, Clone, Debug, PartialEq)]
pub enum EditType {
    Deletion,
    Insertion,
    Replacement,
}

impl fmt::Display for EditType {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            EditType::Deletion => write!(f, "Deletion"),
            EditType::Insertion => write!(f, "Insertion"),
            EditType::Replacement => write!(f, "Replacement"),
        }
    }
}

impl FromStr for EditType {
    type Err = GenBankError;

    fn from_str(input: &str) -> Result<EditType, Self::Err> {
        match input {
            "Deletion" => Ok(EditType::Deletion),
            "Insertion" => Ok(EditType::Insertion),
            "Replacement" => Ok(EditType::Replacement),
            _ => Err(Self::Err::ParseError(
                format!("Unknown edit type: {input}").to_string(),
            )),
        }
    }
}

#[derive(Clone, Debug, PartialEq)]
pub struct GenBankEdit {
    pub start: i64,
    pub end: i64,
    pub old_sequence: String,
    pub new_sequence: String,
    pub edit_type: EditType,
}

#[derive(Clone, Debug, PartialEq)]
pub struct GenBankLocus {
    pub name: String,
    pub molecule_type: Option<String>,
    pub sequence: String,
    pub changes: Vec<GenBankEdit>,
}

impl GenBankLocus {
    pub fn original_sequence(&self) -> String {
        let mut final_sequence = self.sequence.clone();
        let mut offset: i64 = 0;
        for edit in self.changes.iter() {
            let ustart = (edit.start + offset) as usize;
            let uend = (edit.end + offset) as usize;
            match edit.edit_type {
                EditType::Insertion => {
                    final_sequence =
                        format!("{}{}", &final_sequence[..ustart], &final_sequence[uend..]);
                }
                EditType::Deletion | EditType::Replacement => {
                    final_sequence = format!(
                        "{}{}{}",
                        &final_sequence[..ustart],
                        edit.old_sequence,
                        &final_sequence[uend..]
                    );
                }
            }
            offset += edit.old_sequence.len() as i64 - edit.new_sequence.len() as i64;
        }
        final_sequence
    }

    pub fn changes_to_wt(&self) -> Vec<GenBankEdit> {
        let mut wt_changes = vec![];
        let mut offset: i64 = 0;
        for edit in self.changes.iter() {
            let seq_diff = edit.old_sequence.len() as i64 - edit.new_sequence.len() as i64;
            wt_changes.push(GenBankEdit {
                start: edit.start + offset,
                end: edit.end + offset + seq_diff,
                old_sequence: edit.old_sequence.clone(),
                new_sequence: edit.new_sequence.clone(),
                edit_type: edit.edit_type,
            });
            offset += seq_diff;
        }
        wt_changes.sort_unstable_by(|a, b| Ord::cmp(&a.start, &b.start));
        wt_changes
    }
}

pub fn process_sequence(seq: Seq) -> Result<GenBankLocus, GenBankError> {
    let final_sequence = if let Ok(sequence) = str::from_utf8(&seq.seq) {
        sequence.to_string()
    } else {
        return Err(GenBankError::ParseError("No sequence present".to_string()));
    };

    let geneious_edit = Regex::new(r"Geneious type: Editing History (?P<edit_type>\w+)")?;
    let mut locus = GenBankLocus {
        name: seq.name.unwrap_or_default(),
        sequence: final_sequence.clone(),
        molecule_type: seq.molecule_type,
        changes: vec![],
    };

    for feature in seq.features.iter() {
        for (key, value) in feature.qualifiers.iter() {
            if key == "note" {
                if let Some(v) = value {
                    let geneious_mod = geneious_edit.captures(v);
                    if let Some(edit) = geneious_mod {
                        let (mut start, mut end) = feature
                            .location
                            .find_bounds()
                            .map_err(|_| GenBankError::LocationError("Ambiguous Bounds"))?;
                        match &edit["edit_type"] {
                            "Insertion" => {
                                // If there is an insertion, it means that the WT is missing
                                // this sequence, so we actually treat it as a deletion
                                locus.changes.push(GenBankEdit {
                                    start,
                                    end,
                                    old_sequence: "".to_string(),
                                    new_sequence: final_sequence[start as usize..end as usize]
                                        .to_string(),
                                    edit_type: EditType::Insertion,
                                });
                            }
                            "Deletion" | "Replacement" => {
                                // If there is a deletion, it means that found sequence is missing
                                // this sequence, so we treat it as an insertion
                                let deleted_seq = normalize_string(
                                    &feature
                                        .qualifiers
                                        .iter()
                                        .filter(|(k, _v)| k == "Original_Bases")
                                        .map(|(_k, v)| v.clone())
                                        .collect::<Option<String>>()
                                        .expect("Deleted sequence is not annotated."),
                                );
                                if matches!(feature.location, Location::Between(_, _)) {
                                    start += 1;
                                    end -= 1;
                                }
                                locus.changes.push(GenBankEdit {
                                    start,
                                    end,
                                    old_sequence: deleted_seq,
                                    new_sequence: final_sequence[start as usize..end as usize]
                                        .to_string(),
                                    edit_type: EditType::from_str(&edit["edit_type"])?,
                                });
                            }
                            t => {
                                println!("Unknown edit type {t}.")
                            }
                        }
                    }
                }
            }
        }
    }

    locus.changes.sort_unstable_by(|a, b| a.start.cmp(&b.start));
    Ok(locus)
}

#[cfg(test)]
mod tests {
    use super::*;
    use gb_io::reader;
    use noodles::fasta;
    use std::path::PathBuf;

    fn get_unmodified_sequence() -> String {
        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("fixtures/geneious_genbank/unmodified.fa");
        let mut reader = fasta::io::reader::Builder.build_from_path(path).unwrap();
        let mut records = reader.records();
        let record = records.next().unwrap().unwrap();
        let seq = record.sequence();
        str::from_utf8(seq.as_ref()).unwrap().to_string()
    }

    #[test]
    fn test_restores_original_sequence() {
        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("fixtures/geneious_genbank/insertion.gb");
        let mut a = reader::parse_file(&path).unwrap();
        let seq = process_sequence(a.remove(0)).unwrap();
        assert_eq!(seq.original_sequence(), get_unmodified_sequence());
    }

    #[test]
    fn test_returns_changes_to_wt_sequence() {
        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("fixtures/geneious_genbank/multiple_insertions_deletions.gb");
        let mut a = reader::parse_file(&path).unwrap();
        let seq = process_sequence(a.remove(0)).unwrap();
        let changes = seq.changes_to_wt();
        assert_eq!(changes, vec![
            GenBankEdit {
                start: 119,
                end: 237,
                old_sequence: "TGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTG".to_string(),
                new_sequence: "aact".to_string(),
                edit_type: EditType::Replacement
            }, GenBankEdit {
                start: 1425,
                end: 1425,
                old_sequence: "".to_string(),
                new_sequence: "tcagaagaactcgtcaagaaggcgatagaaggcgatgcgctgcgaatcgggagcggcgataccgtaaagcacgaggaagcggtcagcccattcgccgccaagctcttcagcaatatcacgggtagccaacgctatgtcctgatagcggtccgccacacccagccggccacagtcgatgaatccagaaaagcggccattttccaccatgatattcggcaagcaggcatcgccatgggtcacgacgagatcctcgccgtcgggcatgcgcgccttgagcctggcgaacagttcggctggcgcgagcccctgatgctcttcgtccagatcatcctgatcgacaagaccggcttccatccgagtacgtgctcgctcgatgcgatgtttcgcttggtggtcgaatgggcaggtagccggatcaagcgtatgcagccgccgcattgcatcagccatgatggatactttctcggcaggagcaaggtgagatgacaggagatcctgccccggcacttcgcccaatagcagccagtcccttcccgcttcagtgacaacgtcgagcacagctgcgcaaggaacgcccgtcgtggccagccacgatagccgcgctgcctcgtcctgcagttcattcagggcaccggacaggtcggtcttgacaaaaagaaccgggcgcccctgcgctgacagccggaacacggcggcatcagagcagccgattgtctgttgtgcccagtcatagccgaatagcctctccacccaagcggccggagaacctgcgtgcaatccatcttgttcaatcat".to_string(),
                edit_type: EditType::Insertion
            }, GenBankEdit {
                start: 3878,
                end: 4319,
                old_sequence: "TTCTTTGCTTCCTCGCCAGTTCGCTCGCTATGCTCGGTTACACGGCTGCGGCGAGCGCTAGTGATAATAAGTGACTGAGGTATGTGCTCTTCTTATCTCCTTTTGTAGTGTTGCTCTTATTTTAAACAACTTTGCGGTTTTTTGATGACTTTGCGATTTTGTTGTTGCTTTGCAGTAAATTGCAAGATTTAATAAAAAAACGCAAAGCAATGATTAAAGGATGTTCAGAATGAAACTCATGGAAACACTTAACCAGTGCATAAACGCTGGTCATGAAATGACGAAGGCTATCGCCATTGCACAGTTTAATGATGACAGCCCGGAAGCGAGGAAAATAACCCGGCGCTGGAGAATAGGTGAAGCAGCGGATTTAGTTGGGGTTTCTTCTCAGGCTATCAGAGATGCCGAGAAAGCAGGGCGACTACCGCACCCGGATATGGA".to_string(),
                new_sequence: "".to_string(),
                edit_type: EditType::Deletion
            }, GenBankEdit {
                start: 5750,
                end: 5908,
                old_sequence: "GCTTATGAACGTGGTCAGCGTTATGCAAGCCGATTGCAGAATGAATTTGCTGGAAATATTTCTGCGCTGGCTGATGCGGAAAATATTTCACGTAAGATTATTACCCGCTGTATCAACACCGCCAAATTGCCTAAATCAGTTGTTGCTCTTTTTTCTCA".to_string(),
                new_sequence: "aaattt".to_string(),
                edit_type: EditType::Replacement
            }, GenBankEdit {
                start: 5909,
                end: 5909,
                old_sequence: "".to_string(),
                new_sequence: "ccggg".to_string(),
                edit_type: EditType::Insertion
            }]);

        // apply all these changes to the WT sequence and ensure we get the final sequence out
        let mut wt_sequence = seq.original_sequence();
        assert_ne!(wt_sequence, seq.sequence);
        for change in changes.iter().rev() {
            wt_sequence = format!(
                "{}{}{}",
                &wt_sequence[..change.start as usize],
                change.new_sequence,
                &wt_sequence[change.end as usize..]
            );
        }
        assert_eq!(wt_sequence, seq.sequence);
    }
}