spoa 0.2.1

A rust wrapper around the spoa C++ SIMD partial order alignment library.
Documentation
use spoa_sys::ffi;

pub use ffi::AlignmentType;

/// An opaque struct containing an alignment of a sequence to a `Graph`
pub struct Alignment(cxx::UniquePtr<ffi::Alignment>);

/// An opaque struct for aligning sequences to a `Graph`
pub struct AlignmentEngine(cxx::UniquePtr<ffi::AlignmentEngine>);

impl AlignmentEngine {
    /// Construct a new `AlignmentEngine`, given an `AlignmentType`, a match score (`m`), a
    /// mismatch score (`n`), a gap open score (`g`), a gap extension score (`e`), a second gap
    /// open score (`q`), and a second gap extension score (`c`).
    pub fn new(typ: AlignmentType, m: i8, n: i8, g: i8, e: i8, q: i8, c: i8) -> Self {
        AlignmentEngine(ffi::create_alignment_engine(typ, m, n, g, e, q, c))
    }

    /// Align a `sequence` to a `graph`, returning an `Alignment`
    pub fn align(&mut self, sequence: &[u8], graph: &Graph) -> Alignment {
        let sequence_len = u32::try_from(sequence.len()).unwrap();
        Alignment(unsafe {
            ffi::align(
                self.0.pin_mut(),
                sequence.as_ptr() as *const i8,
                sequence_len,
                graph.0.as_ref().unwrap(),
            )
        })
    }
}

/// An opaque struct for the partial order alignment graph
pub struct Graph(cxx::UniquePtr<ffi::Graph>);

impl Graph {
    /// Construct a new, empty `Graph`.
    pub fn new() -> Self {
        Graph(ffi::create_graph())
    }

    /// Add an `Alignment` to a `Graph`, along with its `sequence` and uniform `weight`.
    ///
    /// The `alignment` should be derived from calling `AlignmentEngine::align` with the
    /// `sequence`.
    pub fn add_alignment(&mut self, alignment: &Alignment, sequence: &[u8], weight: u32) {
        let sequence_len = u32::try_from(sequence.len()).unwrap();
        unsafe {
            ffi::add_alignment(
                self.0.pin_mut(),
                alignment.0.as_ref().unwrap(),
                sequence.as_ptr() as *const i8,
                sequence_len,
                weight,
            )
        }
    }

    /// Add an `Alignment` to a `Graph`, along with its `sequence` and per-base FASTQ `quality` scores.
    ///
    /// The `alignment` should be derived from calling `AlignmentEngine::align` with the
    /// `sequence`.
    pub fn add_alignment_with_qual(
        &mut self,
        alignment: &Alignment,
        sequence: &[u8],
        quality: &[u8],
    ) {
        let sequence_len = u32::try_from(sequence.len()).unwrap();
        let quality_len = u32::try_from(quality.len()).unwrap();
        assert!(sequence_len == quality_len);
        unsafe {
            ffi::add_alignment_with_qual(
                self.0.pin_mut(),
                alignment.0.as_ref().unwrap(),
                sequence.as_ptr() as *const i8,
                sequence_len,
                quality.as_ptr() as *const i8,
                quality_len,
            )
        }
    }

    /// Generate a consenus sequence from the partial order `Graph`.
    pub fn consensus(&mut self) -> Vec<u8> {
        Vec::from(
            ffi::generate_consensus(self.0.pin_mut())
                .as_ref()
                .unwrap()
                .as_bytes(),
        )
    }

    /// Generate a consensus sequence with `min_coverage` from the partial order `Graph`.
    pub fn consensus_with_min_coverage(&mut self, min_coverage: i32) -> Vec<u8> {
        Vec::from(
            ffi::generate_consensus_with_min_coverage(self.0.pin_mut(), min_coverage)
                .as_ref()
                .unwrap()
                .as_bytes(),
        )
    }

    /// Generate a multiple sequence alignment for all sequences added to the `Graph`.
    ///
    /// If `include_consensus` is provided, the consensus sequence is provided, also aligned, at
    /// the end.
    pub fn multiple_sequence_alignment(&mut self, include_consensus: bool) -> Vec<Vec<u8>> {
        let msa = ffi::generate_multiple_sequence_alignment(self.0.pin_mut(), include_consensus);
        let mut result = vec![];
        for aln in msa.as_ref().unwrap().iter() {
            result.push(Vec::from(aln.as_bytes()));
        }
        result
    }

    /// Clear the graph
    pub fn clear(&mut self) {
        ffi::graph_clear(self.0.pin_mut())
    }
}

impl Default for Graph {
    fn default() -> Self {
        Graph::new()
    }
}

#[cfg(test)]
mod fastq;

#[cfg(test)]
mod tests {
    use super::*;

    // thanks to pjedge for this small
    const SMALL_SEQS: &[&[u8]] = &[
        b"ATTGCCCGTT",
        b"AATGCCGTT",
        b"AATGCCCGAT",
        b"AACGCCCGTC",
        b"AGTGCTCGTT",
        b"AATGCTCGTT",
    ];

    #[test]
    fn consensus_small() {
        let mut eng = AlignmentEngine::new(AlignmentType::kNW, 5, -4, -3, -1, -3, -1);
        let mut graph = Graph::new();

        for seq in SMALL_SEQS {
            let aln = eng.align(seq, &graph);
            graph.add_alignment(&aln, seq, 1);
        }

        let consensus = graph.consensus();
        assert!(consensus.as_slice() == b"AATGCCCGTT");
    }

    #[test]
    fn consensus_spoa() {
        // test borrowed from spoa itself, obviously
        let file = std::fs::File::open("spoa-sys/spoa/test/data/sample.fastq.gz").unwrap();
        let gz = flate2::read::GzDecoder::new(file);
        let reader = fastq::FastqReader::new(std::io::BufReader::new(gz));

        let mut eng = AlignmentEngine::new(AlignmentType::kSW, 5, -4, -8, -6, -8, -6);
        let mut graph = Graph::new();

        for record in reader {
            let record = record.unwrap();
            let aln = eng.align(record.seq.as_bytes(), &graph);
            graph.add_alignment_with_qual(&aln, record.seq.as_bytes(), record.qual.as_bytes());
        }

        let consensus = graph.consensus();
        let expected = b"AATGATGCGCTTTGTTGGCGCGGTGGCTTGATGCAGGGGCTAATCGACCTCTGGCAACCACTTTTCCATGACAGGAGTTGAATATGGCATTCAGTAATCCCTTCGATGATCCGCAGGGAGCGTTTTACATATTGCGCAATGCGCAGGGGCAATTCAGTCTGTGGCCGCAACAATGCGTCTTACCGGCAGGCTGGGACATTGTGTGTCAGCCGCAGTCACAGGCGTCCTGCCAGCAGTGGCTGGAAGCCCACTGGCGTACTCTGACACCGACGAATTTTACCCAGTTGCAGGAGGCACAATGAGCCAGCATTTACCTTTGGTCGCCGCACAGCCCGGCATCTGGATGGCAGAAAAACTGTCAGAATTACCCTCCGCCTGGAGCGTGGCGCATTACGTTGAGTTAACCGGAGAGGTTGATTCGCCATTACTGGCCCGCGCGGTGGTTGCCGGACTAGCGCAAGCAGATACGCTTTACACGCGCAACCAAGGATTTCGG";
        assert!(consensus.as_slice() == expected);
    }

    #[test]
    fn msa_small() {
        let mut eng = AlignmentEngine::new(AlignmentType::kNW, 5, -4, -3, -1, -3, -1);
        let mut graph = Graph::new();

        for seq in SMALL_SEQS {
            let aln = eng.align(seq, &graph);
            graph.add_alignment(&aln, seq, 1);
        }

        let msa = graph.multiple_sequence_alignment(true);
        assert!(msa.len() == 7);
        assert!(msa[0].as_slice() == b"ATTGCC-CGTT");
        assert!(msa[1].as_slice() == b"AATG-C-CGTT");
        assert!(msa[2].as_slice() == b"AATGCC-CGAT");
        assert!(msa[3].as_slice() == b"AACGCC-CGTC");
        assert!(msa[4].as_slice() == b"AGTG-CTCGTT");
        assert!(msa[5].as_slice() == b"AATG-CTCGTT");
        assert!(msa[6].as_slice() == b"AATGCC-CGTT");

        graph.clear();

        for seq in SMALL_SEQS.iter().rev() {
            let aln = eng.align(seq, &graph);
            graph.add_alignment(&aln, seq, 1);
        }

        let msa = graph.multiple_sequence_alignment(true);
        assert!(msa.len() == 7);
        assert!(msa[0].as_slice() == b"AATGCTCGTT");
        assert!(msa[1].as_slice() == b"AGTGCTCGTT");
        assert!(msa[2].as_slice() == b"AACGCCCGTC");
        assert!(msa[3].as_slice() == b"AATGCCCGAT");
        assert!(msa[4].as_slice() == b"AATGC-CGTT");
        assert!(msa[5].as_slice() == b"ATTGCCCGTT");
        assert!(msa[6].as_slice() == b"AATGCCCGTT");
    }
}