use std::io::Write;
use crate::cli::{FileType, FilterType};
use crate::io_utils::set_ostream;
use crate::merge_ska_array::MergeSkaArray;
use crate::merge_ska_dict::MergeSkaDict;
use crate::ska_dict::bit_encoding::UInt;
use crate::ska_ref::RefSka;
use crate::skalo::extremities::identify_good_kmers;
use crate::skalo::input::build_graph;
use crate::skalo::read_graph::build_variant_groups;
use crate::skalo::utils::{Config, DataInfo};
pub fn align<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
output: &Option<String>,
filter: &FilterType,
mask_ambig: bool,
ignore_const_gaps: bool,
min_freq: f64,
filter_ambig_as_missing: bool,
) {
log::debug!("{ska_array}");
apply_filters(
ska_array,
min_freq,
filter_ambig_as_missing,
filter,
mask_ambig,
ignore_const_gaps,
);
log::info!("Writing alignment");
let mut out_stream = set_ostream(output);
ska_array
.write_fasta(&mut out_stream)
.expect("Couldn't write output fasta");
}
pub fn map<IntT: for<'a> UInt<'a>>(
ska_array: &MergeSkaArray<IntT>,
ska_ref: &mut RefSka<IntT>,
output: &Option<String>,
format: &FileType,
threads: usize,
) {
log::info!("Converting skf to dictionary");
let ska_dict = ska_array.to_dict();
log::info!("Mapping");
ska_ref.map(&ska_dict);
let mut out_stream = set_ostream(output);
match format {
FileType::Aln => {
log::info!("Generating alignment with {threads} threads");
ska_ref
.write_aln(&mut out_stream, threads)
.expect("Failed to write output alignment");
}
FileType::Vcf => {
log::info!("Generating VCF with {threads} threads");
ska_ref
.write_vcf(&mut out_stream, threads)
.expect("Failed to write output VCF");
}
}
}
pub fn merge<IntT: for<'a> UInt<'a>>(
first_array: &MergeSkaArray<IntT>,
skf_files: &[String],
output: &str,
) {
let mut merged_dict = first_array.to_dict();
for (file_idx, skf_in) in skf_files.iter().enumerate() {
log::info!("Merging alignment {}", file_idx + 1);
let next_array = MergeSkaArray::load(skf_in)
.expect("Failed to load input file (inconsistent k-mer lengths?)");
merged_dict.extend(&mut next_array.to_dict());
}
log::info!("Converting and saving merged alignment");
save_skf(&merged_dict, output);
}
pub fn apply_filters<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
min_freq: f64,
filter_ambig_as_missing: bool,
filter: &FilterType,
ambig_mask: bool,
ignore_const_gaps: bool,
) -> i32 {
let update_kmers = false;
let filter_threshold = f64::ceil(ska_array.nsamples() as f64 * min_freq) as usize;
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} filter_ambig_as_missing={filter_ambig_as_missing} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
ska_array.filter(
filter_threshold,
filter_ambig_as_missing,
filter,
ambig_mask,
ignore_const_gaps,
update_kmers,
)
}
pub fn distance<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
output_prefix: &Option<String>,
min_freq: f64,
filt_ambig: bool,
) {
log::debug!("{ska_array}");
let mask_ambig = false;
let ignore_const_gaps = false;
let filter_ambig_as_missing = false;
if min_freq * ska_array.nsamples() as f64 >= 1.0 {
apply_filters(
ska_array,
min_freq,
filter_ambig_as_missing,
&FilterType::NoFilter,
mask_ambig,
ignore_const_gaps,
);
}
let constant = apply_filters(
ska_array,
0.0, filter_ambig_as_missing,
&FilterType::NoConst,
mask_ambig,
ignore_const_gaps,
);
log::info!("Calculating distances");
let distances = ska_array.distance(constant as f64, filt_ambig);
let mut f = set_ostream(output_prefix);
writeln!(
&mut f,
"Sample1\tSample2\tDistance\tMismatches (proportion)\tMatch count\tMismatch count"
)
.unwrap();
let sample_names = ska_array.names();
for (idx, (dist_vec, sample1)) in distances.iter().zip(sample_names).enumerate() {
for (dist, j) in dist_vec.iter().zip(std::ops::Range {
start: (idx + 1),
end: sample_names.len(),
}) {
writeln!(&mut f, "{sample1}\t{}\t{dist}", sample_names[j]).unwrap();
}
}
}
pub fn delete<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
delete_names: &[&str],
out_file: &str,
) {
log::info!("Deleting samples");
ska_array.delete_samples(delete_names);
let outfile_suffix = if out_file.ends_with(".skf") {
out_file.to_string()
} else {
format!("{out_file}.skf")
};
log::info!("Saving modified skf file to {outfile_suffix}");
ska_array
.save(&outfile_suffix)
.expect("Could not save modified array");
}
#[allow(clippy::too_many_arguments)]
pub fn weed<IntT: for<'a> UInt<'a>>(
ska_array: &mut MergeSkaArray<IntT>,
weed_file: &Option<String>,
reverse: bool,
min_freq: f64,
filter_ambig_as_missing: bool,
filter: &FilterType,
ambig_mask: bool,
ignore_const_gaps: bool,
out_file: &str,
) {
if let Some(weed_fasta) = weed_file {
log::info!(
"Making skf of weed file k={} rc={}",
ska_array.kmer_len(),
ska_array.rc()
);
let repeat_mask = false;
let filter_ambig = false;
let ska_weed = RefSka::new(
ska_array.kmer_len(),
weed_fasta,
ska_array.rc(),
repeat_mask,
filter_ambig,
);
if !reverse {
log::info!("Removing weed k-mers");
} else {
log::info!("Keeping only weed k-mers");
}
ska_array.weed(&ska_weed, reverse);
}
let filter_threshold = f64::floor(ska_array.nsamples() as f64 * min_freq) as usize;
if filter_threshold > 0 || *filter != FilterType::NoFilter || ambig_mask || ignore_const_gaps {
log::info!("Applying filters: threshold={filter_threshold} constant_site_filter={filter} filter_ambig_as_missing={filter_ambig_as_missing} ambig_mask={ambig_mask} no_gap_only_sites={ignore_const_gaps}");
let update_kmers = true;
ska_array.filter(
filter_threshold,
filter_ambig_as_missing,
filter,
ambig_mask,
ignore_const_gaps,
update_kmers,
);
}
log::info!("Saving modified skf file");
ska_array
.save(out_file)
.expect("Failed to save output file");
}
pub fn save_skf<IntT: for<'a> UInt<'a>>(ska_dict: &MergeSkaDict<IntT>, out_file: &str) {
let outfile_suffix = if out_file.ends_with(".skf") {
out_file.to_string()
} else {
format!("{out_file}.skf")
};
log::info!("Converting to array representation and saving");
let ska_array = MergeSkaArray::new(ska_dict);
ska_array
.save(&outfile_suffix)
.expect("Failed to save output file");
}
pub fn skalo<IntT: for<'a> UInt<'a>>(ska_array: MergeSkaArray<IntT>, config: Config) {
let (len_kmer, sample_names, all_kmers, index_map) = build_graph(ska_array, config.nb_threads);
let data_info = DataInfo {
k_graph: len_kmer - 1,
sample_names: sample_names.clone(),
};
let (start_kmers, end_kmers) = identify_good_kmers(&all_kmers, &index_map, &data_info);
build_variant_groups(
all_kmers,
start_kmers,
end_kmers,
index_map,
&config,
&data_info,
);
}