sgcount 0.1.35

A fast and flexible sgRNA counter
use super::{Library, Permuter};
use crate::Offset;
use fxread::{FastxRead, Record};
use hashbrown::HashMap;

#[derive(Debug, PartialEq)]
enum Position {
    Plus,
    Minus,
    Centered,
    Null,
}

/// Struct to handle the mapping between the trimmed records generated by [`FastxRead`]
/// and the sequences found in the [`Library`]. It also has an optional argument for
/// unambiguous sequence permutations contained within [`Permuter`].
pub struct Counter {
    results: HashMap<Vec<u8>, usize>,
    total_reads: usize,
    matched_reads: usize,
}
impl Counter {
    /// Initializes a counter from a hashmap directly (for testing)
    pub fn from_hashmap(map: HashMap<Vec<u8>, usize>) -> Self {
        Self {
            results: map,
            total_reads: 0,
            matched_reads: 0,
        }
    }

    /// Initializes counting of reads from the [`FastxRead`] object within
    /// the [`Library`]. Optional argument for the unambiguous sequence permutations
    /// contained within [`Permuter`].
    #[must_use]
    pub fn new(
        reader: Box<dyn FastxRead<Item = Record>>,
        library: &Library,
        permuter: &Option<Permuter>,
        offset: Offset,
        size: usize,
        position_recursion: bool,
    ) -> Self {
        let position = if position_recursion {
            Position::Centered
        } else {
            Position::Null
        };
        let mut total_reads = 0;
        let mut matched_reads = 0;
        let results = Self::count(
            reader,
            library,
            permuter,
            offset,
            size,
            &position,
            &mut total_reads,
            &mut matched_reads,
        );
        Self {
            results,
            total_reads,
            matched_reads,
        }
    }

    /// Publically exposes the results dictionary and returns either the observed count
    /// of a specific match or a zero if there were no matches.
    #[must_use]
    pub fn get_value(&self, token: &[u8]) -> &usize {
        match self.results.get(token) {
            Some(c) => c,
            None => &0,
        }
    }

    /// Assignment process against the [`Library`].
    fn check_library<'a>(token: &[u8], library: &'a Library) -> Option<&'a Vec<u8>> {
        library.contains(token)
    }

    /// Assignment process against the [`Permuter`]. This will only execute if the [`Permuter`] is
    /// optionally not [`None`].
    fn check_permuter<'a>(token: &[u8], permuter: &'a Option<Permuter>) -> Option<&'a Vec<u8>> {
        match permuter {
            Some(p) => p.contains(token),
            None => None,
        }
    }

    /// Assignment process of the struct performing the string matching.
    /// Tokens are first matched against the library (exact matches)
    /// and if none are found then are matched against the permutations (oneoff matches).
    /// If still none are found then it returns [`None`].
    fn assign<'a>(
        record: &Record,
        library: &'a Library,
        permuter: &Option<Permuter>,
        offset: Offset,
        size: usize,
        position: &Position,
    ) -> Option<&'a Vec<u8>> {
        // Apply Trimming to Record
        let token = match Self::apply_trim(record, offset, size, position) {
            Some(t) => t,
            None => return None,
        };

        // Check trimmed sequence against library
        let alias = match Self::check_library(&token, library) {
            Some(s) => Some(s),
            None => match Self::check_permuter(&token, permuter) {
                Some(s) => library.alias(s),
                None => None,
            },
        };

        // If no alias is found try offsetting the position by one
        if alias.is_none() {
            match position {
                // Try offsetting +1
                Position::Centered => {
                    Self::assign(record, library, permuter, offset, size, &Position::Plus)
                }

                // Try offsetting -1
                Position::Plus => {
                    Self::assign(record, library, permuter, offset, size, &Position::Minus)
                }

                // Both positions have been tried or no recursion option so return the null
                Position::Minus | Position::Null => alias,
            }
        }
        // Otherwise return the library alias
        else {
            alias
        }
    }

    /// Applies the correct trimming function depending on the offset direction and position
    #[inline]
    fn apply_trim(
        record: &Record,
        offset: Offset,
        size: usize,
        position: &Position,
    ) -> Option<Vec<u8>> {
        match offset {
            Offset::Forward(index) => Self::trim_forward_sequence(record, index, size, position),
            Offset::Reverse(index) => Self::trim_reverse_sequence(record, index, size, position),
        }
    }

    /// Calculates the bounds of the trimmed sequence given a positional offset
    #[inline]
    fn bounds(
        seq: &[u8],
        offset: usize,
        size: usize,
        position: &Position,
    ) -> Option<(usize, usize)> {
        let (min, max) = match position {
            Position::Plus => (offset + 1, offset + 1 + size),
            Position::Minus => {
                let min = match offset.checked_sub(1) {
                    Some(m) => m,
                    None => return None,
                };
                (min, min + size)
            }
            _ => (offset, offset + size),
        };
        if max > seq.len() {
            return None;
        } else {
            Some((min, max))
        }
    }

    /// Trims the forward sequence to the required boundaries
    #[inline]
    fn trim_forward_sequence(
        record: &Record,
        offset: usize,
        size: usize,
        position: &Position,
    ) -> Option<Vec<u8>> {
        Self::bounds(record.seq(), offset, size, position)
            .map(|(min, max)| record.seq()[min..max].to_vec())
    }

    /// Trims the reverse complemented sequence to the required boundaries
    #[inline]
    fn trim_reverse_sequence(
        record: &Record,
        offset: usize,
        size: usize,
        position: &Position,
    ) -> Option<Vec<u8>> {
        Self::bounds(record.seq(), offset, size, position)
            .map(|(min, max)| record.seq_rev_comp()[min..max].to_vec())
    }

    /// Main functionality of the struct. Performs the counting operation.
    /// Input sequences are assigned to their respective match within the
    /// library or the permutations.
    /// Finally they are folded into a [`HashMap`] whose keys are the sequence
    /// alias within the library and the values the number of observed counts.
    fn count(
        reader: Box<dyn FastxRead<Item = Record>>,
        library: &Library,
        permuter: &Option<Permuter>,
        offset: Offset,
        size: usize,
        position: &Position,
        total_reads: &mut usize,
        matched_reads: &mut usize,
    ) -> HashMap<Vec<u8>, usize> {
        reader
            .into_iter()
            .map(|x| {
                *total_reads += 1;
                x
            })
            .filter_map(|x| Self::assign(&x, library, permuter, offset, size, position))
            .map(|x| {
                *matched_reads += 1;
                x
            })
            .fold(HashMap::new(), |mut accum, x| {
                *accum.entry(x.clone()).or_insert(0) += 1;
                accum
            })
    }

    /// Returns the total number of reads processed
    pub fn total_reads(&self) -> usize {
        self.total_reads
    }

    /// Returns the number of reads that matched the library
    pub fn matched_reads(&self) -> usize {
        self.matched_reads
    }

    /// Returns the fraction of reads that matched the library
    pub fn fraction_mapped(&self) -> f64 {
        self.matched_reads as f64 / self.total_reads as f64
    }
}

#[cfg(test)]
mod test {

    use super::{Counter, Library, Permuter, Position};
    use crate::Offset;
    use fxread::{FastaReader, FastxRead, Record};

    fn trim_reader(distance: bool) -> Box<dyn FastxRead<Item = Record>> {
        let sequence: &'static [u8] = match distance {
            false => b">seq.0\nACTG\n",
            true => b">seq.0\nAGTG\n",
        };
        Box::new(FastaReader::new(sequence))
    }

    fn lib_reader() -> Box<dyn FastxRead<Item = Record>> {
        let sequence: &'static [u8] = b">seq.0\nACTG\n";
        Box::new(FastaReader::new(sequence))
    }

    fn library() -> Library {
        Library::from_reader(lib_reader()).unwrap()
    }

    fn permuter() -> Permuter {
        Permuter::new(library().keys())
    }

    #[test]
    fn count_no_distance_no_permute() {
        let trimmer = trim_reader(false);
        let library = library();
        let count = Counter::new(trimmer, &library, &None, Offset::Forward(0), 4, false);
        assert_eq!(*count.get_value(b"seq.0"), 1);
    }

    #[test]
    fn count_no_distance_with_permute() {
        let trimmer = trim_reader(true);
        let library = library();
        let count = Counter::new(trimmer, &library, &None, Offset::Forward(0), 4, false);
        assert_eq!(*count.get_value(b"seq.0"), 0);
    }

    #[test]
    fn count_with_distance_no_permute() {
        let trimmer = trim_reader(true);
        let library = library();
        let count = Counter::new(trimmer, &library, &None, Offset::Forward(0), 4, false);
        assert_eq!(*count.get_value(b"seq.0"), 0);
    }

    #[test]
    fn count_with_distance_with_permute() {
        let trimmer = trim_reader(true);
        let library = library();
        let permuter = permuter();
        let count = Counter::new(
            trimmer,
            &library,
            &Some(permuter),
            Offset::Forward(0),
            4,
            false,
        );
        assert_eq!(*count.get_value(b"seq.0"), 1);
    }

    #[test]
    fn bounds_checking_standard() {
        let seq = b"ACTGACTGACTG".as_slice();
        let offset = 4;
        let size = 4;
        let position = Position::Null;
        let (min, max) = Counter::bounds(seq, offset, size, &position).unwrap();
        assert_eq!(min, 4);
        assert_eq!(max, 8);
    }
    #[test]
    fn bounds_checking_plus() {
        let seq = b"ACTGACTGACTG".as_slice();
        let offset = 4;
        let size = 4;
        let position = Position::Plus;
        let (min, max) = Counter::bounds(seq, offset, size, &position).unwrap();
        assert_eq!(min, 5);
        assert_eq!(max, 9);
    }

    #[test]
    fn bounds_checking_minus() {
        let seq = b"ACTGACTGACTG".as_slice();
        let offset = 4;
        let size = 4;
        let position = Position::Minus;
        let (min, max) = Counter::bounds(seq, offset, size, &position).unwrap();
        assert_eq!(min, 3);
        assert_eq!(max, 7);
    }

    #[test]
    fn bounds_checking_standard_clipped() {
        let seq = b"ACTGACT".as_slice();
        let offset = 4;
        let size = 4;
        let position = Position::Null;
        let res = Counter::bounds(seq, offset, size, &position);
        assert!(res.is_none());
    }

    #[test]
    fn bounds_checking_plus_clipped() {
        let seq = b"ACTGACT".as_slice();
        let offset = 4;
        let size = 4;
        let position = Position::Plus;
        let res = Counter::bounds(seq, offset, size, &position);
        assert!(res.is_none());
    }

    #[test]
    fn bounds_checking_minus_clipped() {
        let seq = b"ACTGAC".as_slice();
        let offset = 4;
        let size = 4;
        let position = Position::Minus;
        let res = Counter::bounds(seq, offset, size, &position);
        assert!(res.is_none());
    }
}