use log::info;
use std::collections::HashSet;
use std::path::PathBuf;
use needletail::{parse_fastx_file, Sequence};
use rayon::prelude::*;
use crate::types::*;
use crate::{dist, hd, utils};
#[cfg(target_arch = "x86_64")]
pub fn sketch(params: SketchParams) {
let files = utils::get_fasta_files(¶ms.path);
let n_file = files.len();
info!("Start sketching...");
let pb = utils::get_progress_bar(n_file);
let mut all_filesketch: Vec<FileSketch> = (0..n_file)
.into_iter()
.map(|i| FileSketch {
ksize: params.ksize,
scaled: params.scaled,
seed: params.seed,
canonical: params.canonical,
hv_d: params.hv_d,
hv_quant_bits: 16 as u8,
hv_norm_2: 0 as i32,
file_str: files[i].display().to_string(),
hv: Vec::<i16>::new(),
})
.collect();
all_filesketch.par_iter_mut().for_each(|sketch| {
let kmer_hash_set = extract_kmer_hash(&sketch);
let hv = if is_x86_feature_detected!("avx2") {
unsafe { hd::encode_hash_hd_avx2(&kmer_hash_set, &sketch) }
} else {
hd::encode_hash_hd(&kmer_hash_set, &sketch)
};
sketch.hv_norm_2 = dist::compute_hv_l2_norm(&hv);
if params.if_compressed {
sketch.hv_quant_bits = unsafe { hd::compress_hd_sketch(sketch, &hv) };
}
pb.inc(1);
pb.eta();
});
pb.finish_and_clear();
info!(
"Sketching {} files took {:.2}s - Speed: {:.1} files/s",
n_file,
pb.elapsed().as_secs_f32(),
pb.per_sec()
);
utils::dump_sketch(&all_filesketch, ¶ms.out_file);
}
fn extract_kmer_hash(sketch: &FileSketch) -> HashSet<u64> {
let ksize = sketch.ksize;
let threshold = u64::MAX / sketch.scaled;
let seed = sketch.seed;
let mut fastx_reader = parse_fastx_file(PathBuf::from(sketch.file_str.clone()))
.expect("Opening .fna files failed");
let mut hash_set = HashSet::<u64>::new();
while let Some(record) = fastx_reader.next() {
let seqrec: needletail::parser::SequenceRecord<'_> = record.expect("invalid record");
let norm_seq = seqrec.normalize(false);
let rc = norm_seq.reverse_complement();
for (_, kmer, _) in norm_seq.canonical_kmers(ksize, &rc) {
let h = t1ha::t1ha2_atonce(kmer, seed);
if h < threshold {
hash_set.insert(h);
}
}
}
hash_set
}