use std::cmp::Ordering;
use pulp::Arch;
use twox_hash::xxh3::Hash64;
use std::hash::Hasher;
pub fn complement(c: &mut u8) {
let val = *c;
let new_val = if val != b'N' {
if val & 2 != 0 {
val ^ 4
} else {
val ^ 21
}
} else {
val
};
*c = new_val;
}
pub fn revcomp(sequence: &mut [u8]) {
let arch = Arch::new();
arch.dispatch(|| {
sequence.reverse();
sequence.iter_mut().for_each(complement);
});
}
pub fn is_revcomp_min(seq: &[u8]) -> bool {
assert!(!seq.is_empty());
for i in 0..seq.len() {
let mut c = seq[seq.len() - i - 1];
complement(&mut c);
match seq[i].cmp(&c) {
Ordering::Less => return true,
Ordering::Greater => return false,
Ordering::Equal => continue,
}
}
false
}
pub fn find_syncmers<'a, const N: usize>(
k: usize,
s: usize,
ts: &[usize; N],
downsample: Option<f64>,
seq: &'a [u8],
) -> Vec<&'a [u8]> {
assert!(seq.len() > k);
assert!(s < k);
assert!(ts.iter().all(|&t| t <= k - s));
assert!(N < 5);
assert!(N == ts.len());
let mut downsample_threshold = None;
if let Some(downsample) = downsample {
assert!(downsample > 0.0);
assert!(downsample <= 1.0);
downsample_threshold = Some((std::u64::MAX as f64 * downsample) as u64);
}
let syncmer_positions = find_syncmers_pos(k, s, ts, seq);
if downsample.is_none() {
syncmer_positions
.iter()
.map(|&pos| &seq[pos..pos + k])
.collect()
} else {
syncmer_positions
.iter()
.map(|&pos| &seq[pos..pos + k])
.filter(|&syncmer| {
let mut hash = Hash64::with_seed(42);
hash.write(syncmer);
let hash = hash.finish();
hash < downsample_threshold.unwrap()
})
.collect()
}
}
#[allow(clippy::if_same_then_else)]
pub fn find_syncmers_pos<const N: usize>(
k: usize,
s: usize,
ts: &[usize; N],
seq: &[u8],
) -> Vec<usize> {
assert!(seq.len() > k);
assert!(s < k);
assert!(ts.iter().all(|&t| t <= k - s));
assert!(N < 5);
assert!(N == ts.len());
seq.windows(k)
.enumerate()
.filter_map(|(i, kmer)| {
let min_pos = kmer
.windows(s)
.enumerate()
.min_by(|(_, a), (_, b)| a.cmp(b));
if N == 1 && ts[0] == min_pos.unwrap().0 {
Some(i)
} else if N != 1 && ts[0..N].contains(&min_pos.unwrap().0) {
Some(i)
} else {
None
}
})
.collect::<Vec<_>>()
}
pub struct Syncmers<'syncmer, const N: usize> {
pub k: usize,
pub s: usize,
pub t: &'syncmer [usize; N],
pub seq: &'syncmer [u8],
pos: usize,
}
impl<'syncmer, const N: usize> Syncmers<'syncmer, N> {
pub fn new(k: usize, s: usize, t: &'syncmer [usize; N], seq: &'syncmer [u8]) -> Self {
assert!(s < k);
assert!(t.iter().all(|&x| x <= (k - s)));
Syncmers {
k,
s,
t,
seq,
pos: 0,
}
}
}
#[allow(clippy::if_same_then_else)]
impl<'syncmer, const N: usize> Iterator for Syncmers<'syncmer, N> {
type Item = &'syncmer [u8];
fn next(&mut self) -> Option<Self::Item> {
if self.seq.len() - self.pos < self.k {
return None;
}
let kmer = &self.seq[self.pos..self.pos + self.k];
let min_pos = kmer
.windows(self.s)
.enumerate()
.min_by(|(_, a), (_, b)| a.cmp(b));
self.pos += 1;
if N == 1 && self.t[0] == min_pos.unwrap().0 {
Some(kmer)
} else if N != 1 && self.t[0..N].contains(&min_pos.unwrap().0) {
Some(kmer)
} else {
self.next()
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
pub fn test_syncmers_fig1b() {
let sequence = b"CCAGTGTTTACGG";
let syncmer_positions = find_syncmers_pos(5, 2, &[2], sequence);
println!("{:?}", syncmer_positions);
assert!(syncmer_positions == vec![0, 7]);
let sequence = b"CCAGTGTTTACGG";
let syncmers = find_syncmers(5, 2, &[2], None, sequence);
assert!(syncmers == vec![b"CCAGT", b"TTACG"]);
println!("{:?}", syncmers);
let sequence = b"CCAGTGTTTACGG";
let syncmer_positions = find_syncmers_pos(5, 2, &[2, 3], sequence);
println!("{:?}", syncmer_positions);
assert!(syncmer_positions == vec![0, 6, 7]);
}
}