use std::error::Error;
use std::fs::File;
use std::io::{stdout, BufRead, BufReader, BufWriter, Write};
use std::path::Path;
use regex::Regex;
use super::QualOpts;
use crate::cli::check_threads;
use crate::merge_ska_array::MergeSkaArray;
use crate::merge_ska_dict::{build_and_merge, InputFastx};
use crate::ska_dict::bit_encoding::UInt;
use crate::CoverageHistogram;
use crate::cli::{
DEFAULT_KMER, DEFAULT_MINCOUNT, DEFAULT_MINQUAL, DEFAULT_PROPORTION_READS, DEFAULT_QUALFILTER,
DEFAULT_STRAND,
};
use crate::ValidMinKmer;
pub fn read_input_fastas(seq_files: &[String]) -> Vec<InputFastx> {
let mut input_files = Vec::new();
let re_path = Regex::new(r"^.+/(.+)\.(?i:fa|fasta|fastq|fastq\.gz)$").unwrap();
let re_name = Regex::new(r"^(.+)\.(?i:fa|fasta|fastq|fastq\.gz)$").unwrap();
for file in seq_files {
let caps = re_path.captures(file).or(re_name.captures(file));
let name = match caps {
Some(capture) => capture[1].to_string(),
None => file.to_string(),
};
input_files.push((name, file.to_string(), None));
}
input_files
}
pub fn load_array<IntT: for<'a> UInt<'a>>(
input: &[String],
threads: usize,
) -> Result<MergeSkaArray<IntT>, Box<dyn Error>> {
if input.len() == 1 {
log::info!(
"Single file as input, trying to load as skf {}-bits",
IntT::n_bits()
);
if threads > 1 {
log::warn!("--threads only used if building skf, setting to 1");
check_threads(1);
}
MergeSkaArray::load(input[0].as_str())
} else {
log::info!("Multiple files as input, running ska build with default settings");
let input_files = read_input_fastas(input);
let default_qual = QualOpts {
min_count: DEFAULT_MINCOUNT,
min_qual: DEFAULT_MINQUAL,
qual_filter: DEFAULT_QUALFILTER,
};
let merged_dict = build_and_merge(
&input_files,
DEFAULT_KMER,
!DEFAULT_STRAND,
&default_qual,
threads,
DEFAULT_PROPORTION_READS,
);
Ok(MergeSkaArray::new(&merged_dict))
}
}
pub fn set_ostream(oprefix: &Option<String>) -> BufWriter<Box<dyn Write>> {
let out_writer = match oprefix {
Some(prefix) => {
let path = Path::new(prefix);
Box::new(File::create(path).unwrap()) as Box<dyn Write>
}
None => Box::new(stdout()) as Box<dyn Write>,
};
BufWriter::new(out_writer)
}
pub fn get_input_list(
file_list: &Option<String>,
seq_files: &Option<Vec<String>>,
) -> Vec<InputFastx> {
match file_list {
Some(files) => {
let mut input_files: Vec<InputFastx> = Vec::new();
let f = File::open(files).expect("Unable to open file_list");
let f = BufReader::new(f);
for line in f.lines() {
let line = line.expect("Unable to read line in file_list");
let fields: Vec<&str> = line.split_whitespace().collect();
let second_file = match fields.len() {
0..=1 => {
panic!("Unable to parse line in file_list")
}
2 => None,
3 => Some(fields[2].to_string()),
_ => {
panic!("Unable to parse line in file_list")
}
};
input_files.push((fields[0].to_string(), fields[1].to_string(), second_file));
}
input_files
}
None => read_input_fastas(seq_files.as_ref().unwrap()),
}
}
pub fn any_fastq(files: &[InputFastx]) -> bool {
files.iter().any(|file| file.2.is_some())
}
pub fn count_fastq(files: &[InputFastx]) -> usize {
files.iter().filter(|file| file.2.is_some()).count()
}
pub fn get_2_fastq_path(files: &[InputFastx]) -> (String, String) {
let out: Vec<String> = files
.iter()
.filter(|file| file.2.is_some())
.take(2)
.map(|x| x.1.clone())
.collect();
if out.len().gt(&1) {
(out[0].clone(), out[1].clone())
} else {
panic!("Trying to get 2 fastq files from a vector with <2 elements");
}
}
pub fn kmer_min_cutoff<IntT: for<'a> UInt<'a>>(
v: &Option<ValidMinKmer>,
files: &[InputFastx],
k: &usize,
rc: bool,
verbose: bool,
) -> u16 {
if v.is_none() {
log::info!("Using user-provided minimum kmer value of {DEFAULT_MINCOUNT}",);
DEFAULT_MINCOUNT
} else {
match v.unwrap() {
ValidMinKmer::Val(x) => {
log::info!("Using user-provided minimum kmer value of {x}");
x
}
ValidMinKmer::Auto if count_fastq(files).ge(&2) => {
let (fastq_fwd, fastq_rev) = get_2_fastq_path(files);
let mut cov =
CoverageHistogram::<IntT>::new(&fastq_fwd, &fastq_rev, *k, rc, verbose);
let out = cov.fit_histogram().expect("Couldn't fit coverage model") as u16;
cov.plot_hist();
log::info!("Using inferred minimum kmer value of {out}");
out
}
ValidMinKmer::Auto => {
log::info!(
"Not enough fastq files to fit mixture model, using default kmer count of 5"
);
DEFAULT_MINCOUNT
}
}
}
}