use std::io::{BufWriter, Write};
use crate::kmer::LongKmer;
use crate::tempfile::TempFileManager;
use simple_sds_sbwt::raw_vector::*;
use rayon::prelude::*;
pub fn get_sorted_dummies<const B: usize>(sorted_kmers_filepath: &std::path::Path, sigma: usize, k: usize, temp_file_manager: &mut TempFileManager, given_mers: Option<Vec<(LongKmer<B>, u8)>>) -> Vec<(LongKmer::<B>, u8)>{
let n = std::fs::metadata(sorted_kmers_filepath).unwrap().len() as usize / LongKmer::<B>::byte_size();
let mut has_predecessor = simple_sds_sbwt::raw_vector::RawVector::new();
has_predecessor.resize(n, false);
let emptyfile = temp_file_manager.create_new_file("empty-", 10, ".bin");
let mut char_cursors = crate::bitpacked_kmer_sorting::cursors::init_char_cursors::<B>(&emptyfile.path, sorted_kmers_filepath, k, sigma);
let global_cursor = crate::bitpacked_kmer_sorting::cursors::DummyNodeMerger::new(
std::io::BufReader::new(std::fs::File::open(&emptyfile.path).unwrap()),
std::io::BufReader::new(std::fs::File::open(sorted_kmers_filepath).unwrap()),
k,
);
log::info!("Identifying k-mers without predecessors");
for (x, _) in global_cursor{
for c in 0..(sigma as u8){
let xc = x.copy_set_from_left(k-1, 0).right_shifted(1).copy_set_from_left(0, c);
while char_cursors[c as usize].peek().is_some(){
match char_cursors[c as usize].peek().unwrap().0.cmp(&xc) {
std::cmp::Ordering::Greater => break,
std::cmp::Ordering::Equal => {
has_predecessor.set_bit(char_cursors[c as usize].nondummy_position(), true);
char_cursors[c as usize].next(); break
},
std::cmp::Ordering::Less => {
char_cursors[c as usize].next(); }
}
}
}
}
let mut global_cursor = crate::bitpacked_kmer_sorting::cursors::DummyNodeMerger::new(
std::io::BufReader::new(std::fs::File::open(&emptyfile.path).unwrap()),
std::io::BufReader::new(std::fs::File::open(sorted_kmers_filepath).unwrap()),
k,
);
log::info!("Building all prefixes of source nodes");
let n_source_nodes = has_predecessor.len() - has_predecessor.count_ones();
let mut required_dummies = Vec::<(LongKmer::<B>, u8)>::with_capacity(n_source_nodes * k + 1);
while let Some((x, _)) = global_cursor.peek(){
if !has_predecessor.bit(global_cursor.nondummy_position()){
let mut prefix = x;
for i in 0..k {
let len = k - i - 1;
prefix = prefix.left_shifted(1);
required_dummies.push((prefix, len as u8));
}
}
global_cursor.next(); }
if let Some(given_mers) = given_mers {
log::info!("Adding extra dummy k-mer strings");
for (mut prefix, mut prefix_full_len) in given_mers {
if prefix_full_len as usize == k {
prefix = prefix.left_shifted(1);
prefix_full_len -= 1;
}
for i in 0..prefix_full_len {
let len = prefix_full_len - i;
required_dummies.push((prefix, len as u8));
prefix = prefix.left_shifted(1);
}
}
}
required_dummies.push((LongKmer::<B>::from_ascii(b"").unwrap(), 0));
log::info!("Sorting and deduplicating dummies");
required_dummies.par_sort_unstable();
required_dummies.dedup();
required_dummies.shrink_to_fit();
required_dummies
}
pub fn write_to_disk<const B: usize>(dummies: Vec<(LongKmer::<B>, u8)>, writer: &mut std::fs::File){
let mut bw = BufWriter::new(writer);
for (kmer, len) in dummies.iter(){
kmer.serialize(&mut bw).unwrap();
bw.write_all(&[*len]).unwrap();
}
bw.flush().unwrap();
}