use hashbrown::{HashMap, HashSet};
use seq_io::fasta::{Reader, Record};
use std::cmp::Ordering;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use flate2::read::MultiGzDecoder;
use crate::ska_dict::bit_encoding::{valid_base, UInt};
use crate::skalo::utils::{rev_compl, VariantInfo};
pub fn extract_genomic_kmers<IntT: for<'a> UInt<'a>>(
file_path: PathBuf,
k: usize,
) -> (HashMap<IntT, Vec<u32>>, Vec<u8>, String) {
let mut kmer_map: HashMap<IntT, Vec<u32>> = HashMap::new();
let mut overflow_kmers: HashSet<IntT> = HashSet::new();
let mut genome_seq: Vec<u8> = Vec::new();
let mut genome_name: String = "".to_string();
let buf = get_reader(&file_path);
let mut reader = Reader::new(buf);
let mut sequence_count = 0;
while let Some(record) = reader.next() {
sequence_count += 1;
if sequence_count > 1 {
panic!("\nError: more than one sequence detected in the reference genome file.\n");
}
let record_ready = match record {
Ok(record) => record,
Err(_) => continue, };
genome_seq = record_ready
.seq()
.iter()
.copied()
.filter(|&byte| !byte.is_ascii_whitespace())
.map(|byte| byte.to_ascii_uppercase())
.collect();
genome_name = record_ready.id().unwrap().to_string();
if genome_seq.len() >= k {
for n in 0..(genome_seq.len() - k + 1) {
let kmer = &genome_seq[n..n + k];
if kmer.iter().all(|x| valid_base(*x)) {
let kmer_encoded = IntT::encode_kmer(kmer);
if overflow_kmers.contains(&kmer_encoded) {
continue;
}
let positions = kmer_map.entry(kmer_encoded).or_insert_with(Vec::new);
if positions.len() < 3 {
positions.push((n + k) as u32);
}
if positions.len() > 3 {
kmer_map.remove(&kmer_encoded);
overflow_kmers.insert(kmer_encoded);
}
}
}
}
}
(kmer_map, genome_seq, genome_name)
}
fn get_reader(path: &PathBuf) -> Box<dyn BufRead + Send> {
let filename_str = path.to_str().unwrap();
let file = match File::open(path) {
Ok(file) => file,
Err(error) => panic!("Error opening file: {error:?}."),
};
if filename_str.ends_with(".gz") {
Box::new(BufReader::new(MultiGzDecoder::new(file)))
} else {
Box::new(BufReader::new(file))
}
}
pub fn scan_variants<IntT: for<'a> UInt<'a>>(
vec_variants: &[VariantInfo],
len_kmer_graph: usize,
kmer_map: &HashMap<IntT, Vec<u32>>,
) -> (bool, u32, String) {
let mut final_position = 0;
let mut vec_position_forward: Vec<u32> = Vec::new();
let mut vec_position_reverse: Vec<u32> = Vec::new();
for variant in vec_variants {
let seq = variant.sequence.decode();
let rc_seq = rev_compl(&seq);
for pos in 0..=seq.len() - len_kmer_graph {
let encoded_kmer = IntT::encode_kmer_str(&seq[pos..pos + len_kmer_graph]);
if let Some(vec_pos) = kmer_map.get(&encoded_kmer) {
for position in vec_pos {
vec_position_forward.push(position - pos as u32);
}
}
}
for pos in 0..=rc_seq.len() - len_kmer_graph {
let encoded_kmer = IntT::encode_kmer_str(&rc_seq[pos..pos + len_kmer_graph]);
if let Some(vec_pos) = kmer_map.get(&encoded_kmer) {
for position in vec_pos {
vec_position_reverse.push(position - pos as u32);
}
}
}
}
let forward = if !vec_position_forward.is_empty() {
let result = most_frequent_position(&vec_position_forward);
if result.1 == 0 {
None
} else {
Some(result)
}
} else {
None
};
let reverse = if !vec_position_reverse.is_empty() {
let result = most_frequent_position(&vec_position_reverse);
if result.1 == 0 {
None
} else {
Some(result)
}
} else {
None
};
let (positioned, final_orientation) = match (forward, reverse) {
(Some((pos_forward, count_forward)), Some((pos_reverse, count_reverse))) => {
match count_forward.cmp(&count_reverse) {
std::cmp::Ordering::Equal => (false, "none".to_string()),
std::cmp::Ordering::Greater => {
final_position = pos_forward;
(true, "for".to_string())
}
std::cmp::Ordering::Less => {
final_position = pos_reverse;
(true, "rc".to_string())
}
}
}
(Some((pos_forward, _)), None) => {
final_position = pos_forward;
(true, "for".to_string())
}
(None, Some((pos_reverse, _))) => {
final_position = pos_reverse;
(true, "rc".to_string())
}
(None, None) => (false, "none".to_string()),
};
(positioned, final_position, final_orientation)
}
fn most_frequent_position(numbers: &[u32]) -> (u32, usize) {
let counts = numbers.iter().fold(HashMap::new(), |mut counts, &num| {
*counts.entry(num).or_insert(0) += 1;
counts
});
let mut max_element = None;
let mut max_count = 0;
let mut tie = false;
for (&num, &count) in &counts {
match count.cmp(&max_count) {
Ordering::Greater => {
max_element = Some(num);
max_count = count;
tie = false;
}
Ordering::Equal => {
tie = true;
}
Ordering::Less => {}
}
}
if tie {
return (0, 0); }
if let Some(position) = max_element {
if max_count < 10 {
return (0, 0); } else {
return (position, max_count);
}
}
(0, 0) }