rsomics-kmer 0.2.0

K-mer encoding, canonicalisation, ntHash rolling hash, MurmurHash3, k-mer counting for the rsomics-* tool family. Layer A primitive.
Documentation
use crate::encode::{Kmer, encode};
use crate::{KmerError, Result};

pub struct KmerIter<'a> {
    seq: &'a [u8],
    k: usize,
    pos: usize,
    current: Option<Kmer>,
    canonical: bool,
}

impl<'a> KmerIter<'a> {
    pub fn new(seq: &'a [u8], k: usize, canonical: bool) -> Result<Self> {
        if !(1..=32).contains(&k) {
            return Err(KmerError::KOutOfRange(k));
        }
        if seq.len() < k {
            return Err(KmerError::SeqTooShort { len: seq.len(), k });
        }
        Ok(Self {
            seq,
            k,
            pos: 0,
            current: None,
            canonical,
        })
    }
}

impl Iterator for KmerIter<'_> {
    type Item = Result<Kmer>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.pos + self.k > self.seq.len() {
            return None;
        }
        let window = &self.seq[self.pos..self.pos + self.k];
        self.pos += 1;
        let kmer = match encode(window) {
            Ok(k) => k,
            Err(e) => {
                self.current = None;
                return Some(Err(e));
            }
        };
        let out = if self.canonical {
            crate::encode::canonical(kmer, self.k)
        } else {
            kmer
        };
        self.current = Some(out);
        Some(Ok(out))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::encode::decode;

    #[test]
    fn iter_yields_n_minus_k_plus_1_kmers() {
        let seq = b"ACGTACGT";
        let it = KmerIter::new(seq, 4, false).unwrap();
        let kmers: Vec<_> = it.collect::<Result<Vec<_>>>().unwrap();
        assert_eq!(kmers.len(), seq.len() - 4 + 1);
        let first = decode(kmers[0], 4);
        let last = decode(kmers[4], 4);
        assert_eq!(&first, b"ACGT");
        assert_eq!(&last, b"ACGT");
    }

    #[test]
    fn iter_canonical_is_idempotent_under_rc() {
        let fwd: Vec<u64> = KmerIter::new(b"ACGTACGT", 4, true)
            .unwrap()
            .collect::<Result<_>>()
            .unwrap();
        let rev_comp = b"ACGTACGT";
        let rc: Vec<u64> = KmerIter::new(rev_comp, 4, true)
            .unwrap()
            .collect::<Result<_>>()
            .unwrap();
        let mut a = fwd;
        let mut b = rc;
        a.sort_unstable();
        b.sort_unstable();
        assert_eq!(a, b);
    }

    #[test]
    fn iter_too_short_seq_errors() {
        assert!(matches!(
            KmerIter::new(b"AC", 4, false),
            Err(KmerError::SeqTooShort { len: 2, k: 4 })
        ));
    }

    #[test]
    fn iter_propagates_non_acgt_error() {
        let seq = b"ACGTNAC";
        let mut it = KmerIter::new(seq, 4, false).unwrap();
        assert!(it.next().unwrap().is_ok());
        let third = it.next().unwrap();
        assert!(matches!(third, Err(KmerError::NonAcgt { .. })));
    }
}