use std::cmp::Ordering;
use hashbrown::HashMap;
extern crate needletail;
use needletail::{parse_fastx_file, parser::Format};
pub mod split_kmer;
use super::QualOpts;
use crate::ska_dict::split_kmer::SplitKmer;
pub mod bit_encoding;
use crate::ska_dict::bit_encoding::{decode_base, UInt, IUPAC};
pub mod bloom_filter;
use crate::ska_dict::bloom_filter::KmerFilter;
pub mod nthash;
#[derive(Debug, Clone, Default)]
pub struct SkaDict<IntT> {
k: usize,
rc: bool,
sample_idx: usize,
name: String,
split_kmers: HashMap<IntT, u8>,
kmer_filter: KmerFilter,
}
impl<IntT> SkaDict<IntT>
where
IntT: for<'a> UInt<'a>,
{
fn add_to_dict(&mut self, kmer: IntT, base: u8) {
self.split_kmers
.entry(kmer)
.and_modify(|b| *b = IUPAC[base as usize * 256 + *b as usize])
.or_insert(decode_base(base));
}
fn add_palindrome_to_dict(&mut self, kmer: IntT, base: u8) {
self.split_kmers
.entry(kmer)
.and_modify(|b| {
*b = match b {
b'W' => {
if base == 0 || base == 2 {
b'W'
} else {
b'N'
}
}
b'S' => {
if base == 0 || base == 2 {
b'N'
} else {
b'S'
}
}
b'N' => b'N',
_ => panic!("Palindrome middle base not W/S: {}", *b as char),
}
})
.or_insert(match base {
0 | 2 => b'W', 1 | 3 => b'S', _ => panic!("Base encoding error: {}", base as char),
});
}
fn add_file_kmers(
&mut self,
filename: &str,
is_reads: bool,
qual: &QualOpts,
proportion_reads: Option<f64>,
) {
let mut step: usize = 1;
if let Some(prop) = proportion_reads {
step = (1.0 / prop).round() as usize;
}
let mut reader =
parse_fastx_file(filename).unwrap_or_else(|_| panic!("Invalid path/file: {filename}"));
let mut iter_reads: usize = 0;
while let Some(record) = reader.next() {
if !iter_reads.is_multiple_of(step) {
iter_reads += 1;
continue;
} else {
iter_reads += 1;
}
let seqrec = record.expect("Invalid FASTA/Q record");
let kmer_opt = SplitKmer::new(
seqrec.seq(),
seqrec.num_bases(),
seqrec.qual(),
self.k,
self.rc,
qual.min_qual,
qual.qual_filter,
is_reads,
);
if let Some(mut kmer_it) = kmer_opt {
if !is_reads
|| (kmer_it.middle_base_qual()
&& Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
{
let (kmer, base, _rc) = kmer_it.get_curr_kmer();
if kmer_it.self_palindrome() {
self.add_palindrome_to_dict(kmer, base);
} else {
self.add_to_dict(kmer, base);
}
}
while let Some((kmer, base, _rc)) = kmer_it.get_next_kmer() {
if !is_reads
|| (kmer_it.middle_base_qual()
&& Ordering::is_eq(self.kmer_filter.filter(&kmer_it)))
{
if kmer_it.self_palindrome() {
self.add_palindrome_to_dict(kmer, base);
} else {
self.add_to_dict(kmer, base);
}
}
}
}
}
}
#[allow(clippy::too_many_arguments)]
pub fn new(
k: usize,
sample_idx: usize,
files: (&str, Option<&String>),
name: &str,
rc: bool,
qual: &QualOpts,
proportion_reads: Option<f64>,
) -> Self {
if !(5..=63).contains(&k) || k.is_multiple_of(2) {
panic!("Invalid k-mer length");
}
let mut sk_dict = Self {
k,
rc,
sample_idx,
name: name.to_string(),
split_kmers: HashMap::default(),
kmer_filter: KmerFilter::new(qual.min_count),
};
let mut reader_peek =
parse_fastx_file(files.0).unwrap_or_else(|_| panic!("Invalid path/file: {}", files.0));
let seq_peek = reader_peek
.next()
.expect("Invalid FASTA/Q record")
.expect("Invalid FASTA/Q record");
let mut is_reads = false;
if seq_peek.format() == Format::Fastq {
sk_dict.kmer_filter.init();
is_reads = true;
}
sk_dict.add_file_kmers(files.0, is_reads, qual, proportion_reads);
if let Some(second_filename) = files.1 {
sk_dict.add_file_kmers(second_filename, is_reads, qual, proportion_reads);
}
if sk_dict.ksize() == 0 {
panic!("{} has no valid sequence", files.0);
}
sk_dict
}
pub fn kmer_len(&self) -> usize {
self.k
}
pub fn rc(&self) -> bool {
self.rc
}
pub fn ksize(&self) -> usize {
self.split_kmers.len()
}
pub fn idx(&self) -> usize {
self.sample_idx
}
pub fn kmers(&self) -> &HashMap<IntT, u8> {
&self.split_kmers
}
pub fn name(&self) -> &String {
&self.name
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_add_palindrome_to_dict() {
let mut test_obj = SkaDict::<u64>::default();
test_obj.split_kmers.insert(123, b'W');
test_obj.add_palindrome_to_dict(123, 1);
assert_eq!(test_obj.split_kmers[&123], b'N');
test_obj.split_kmers.clear();
test_obj.add_palindrome_to_dict(456, 0);
assert_eq!(test_obj.split_kmers[&456], b'W');
test_obj.split_kmers.clear();
test_obj.add_palindrome_to_dict(789, 3);
assert_eq!(test_obj.split_kmers[&789], b'S');
test_obj.split_kmers.insert(123, b'W');
test_obj.add_palindrome_to_dict(123, 1);
test_obj.add_palindrome_to_dict(123, 1);
assert_eq!(test_obj.split_kmers[&123], b'N');
}
#[test]
#[should_panic]
fn test_panic_add_palindrome_to_dict() {
let mut test_obj_panic = SkaDict::<u64>::default();
test_obj_panic.add_palindrome_to_dict(987, 5);
}
#[test]
#[should_panic]
fn test_panic2_add_palindrome_to_dict() {
let mut test_obj_panic = SkaDict::<u64>::default();
test_obj_panic.split_kmers.clear();
test_obj_panic.split_kmers.insert(555, b'A');
test_obj_panic.add_palindrome_to_dict(555, 1);
}
}