use hashbrown::{HashMap, HashSet};
const LEXICON: [u8; 5] = [b'A', b'C', b'G', b'T', b'N'];
type NullSet = HashSet<Vec<u8>>;
type PermuteMap = HashMap<Vec<u8>, Vec<u8>>;
pub struct Permuter {
map: PermuteMap,
_null: NullSet,
}
impl Permuter {
pub fn new<'a>(sequences: impl Iterator<Item = &'a Vec<u8>>) -> Self {
let (map, null) = Self::build(sequences);
Self { map, _null: null }
}
#[must_use]
pub fn contains(&self, token: &[u8]) -> Option<&Vec<u8>> {
self.map.get(token)
}
fn build<'a>(sequences: impl Iterator<Item = &'a Vec<u8>>) -> (PermuteMap, NullSet) {
sequences
.map(|seq| (seq, Self::permute_sequence(seq, LEXICON)))
.fold(
(HashMap::new(), HashSet::new()),
|(mut table, mut null), (seq, permute)| {
permute
.iter()
.for_each(|x| Self::insert_sequence(seq, x, &mut null, &mut table));
(table, null)
},
)
}
fn permute_sequence(sequence: &[u8], lexicon: [u8; 5]) -> Vec<Vec<u8>> {
sequence
.iter()
.enumerate()
.map(|(idx, _)| Self::sequence_regions(sequence, idx))
.flat_map(|(p, x, s)| Self::build_permutations(p, s, x, lexicon))
.collect()
}
fn sequence_regions(sequence: &[u8], idx: usize) -> (&[u8], &[u8], &[u8]) {
let (prefix, poschar) = sequence.split_at(idx);
let (_, suffix) = sequence.split_at(idx + 1);
(prefix, poschar, suffix)
}
fn build_permutations(
prefix: &[u8],
suffix: &[u8],
poschar: &[u8],
lexicon: [u8; 5],
) -> Vec<Vec<u8>> {
lexicon
.iter()
.filter(move |y| **y != poschar[0])
.map(|y| Self::build_permutation(prefix, suffix, *y))
.collect()
}
fn build_permutation(prefix: &[u8], suffix: &[u8], insertion: u8) -> Vec<u8> {
let mut sequence = Vec::new();
sequence.extend_from_slice(prefix);
sequence.push(insertion);
sequence.extend_from_slice(suffix);
sequence
}
fn insert_sequence(
sequence: &[u8],
permutation: &Vec<u8>,
null: &mut NullSet,
table: &mut PermuteMap,
) {
if !null.contains(sequence) {
null.insert(sequence.to_owned());
}
if !null.contains(permutation) {
if table.contains_key(permutation) {
Self::insert_to_null(permutation, null, table);
} else {
Self::insert_to_table(permutation, sequence, table);
}
}
}
fn insert_to_null(permutation: &Vec<u8>, null: &mut NullSet, table: &mut PermuteMap) {
table.remove(permutation);
null.insert(permutation.clone());
}
fn insert_to_table(permutation: &[u8], sequence: &[u8], table: &mut PermuteMap) {
table.insert(permutation.to_vec(), sequence.to_owned());
}
}
#[cfg(test)]
mod test {
use super::Permuter;
#[test]
fn build() {
let sequences = vec![b"AC".to_vec(), b"CG".to_vec()];
Permuter::new(sequences.iter());
}
#[test]
fn validate_singleton() {
let sequences = vec![b"ACTG".to_vec()];
let permuter = Permuter::new(sequences.iter());
let truth: Vec<Vec<u8>> = vec![
b"AATG", b"ACGG", b"ACAG", b"TCTG", b"ACNG", b"NCTG", b"ACTA", b"GCTG", b"AGTG",
b"ACTC", b"ATTG", b"ANTG", b"ACCG", b"ACTT", b"CCTG", b"ACTN",
]
.iter()
.map(|s| s.to_vec())
.collect();
assert!(truth.iter().all(|x| permuter.map.contains_key(x)));
assert!(truth.iter().all(|x| !permuter._null.contains(x)));
assert!(permuter._null.contains(&b"ACTG".to_vec()));
assert_eq!(permuter._null.len(), 1);
}
#[test]
fn validate_positive() {
let sequences = vec![b"AC".to_vec(), b"CG".to_vec()];
let permuter = Permuter::new(sequences.iter());
let known_positives: Vec<Vec<u8>> = vec![
b"GC", b"TC", b"NC", b"AA", b"AT", b"AN", b"CA", b"CT", b"CN", b"GG", b"TG", b"NG",
]
.iter()
.map(|s| s.to_vec())
.collect();
known_positives
.iter()
.for_each(|x| assert!(permuter.map.contains_key(x)));
assert_eq!(permuter.map.len(), 12);
known_positives
.iter()
.for_each(|x| assert!(!permuter._null.contains(x)));
}
#[test]
fn validate_negative() {
let sequences = vec![b"AC".to_vec(), b"CG".to_vec()];
let permuter = Permuter::new(sequences.iter());
let known_negatives: Vec<Vec<u8>> = vec![b"AG", b"CG", b"CC", b"AG"]
.iter()
.map(|s| s.to_vec())
.collect();
known_negatives
.iter()
.for_each(|x| assert!(permuter._null.contains(x)));
assert_eq!(permuter._null.len(), 4);
known_negatives
.iter()
.for_each(|x| assert!(!permuter.map.contains_key(x)));
}
}