use hashbrown::{HashMap, HashSet};
use std::fs::File;
use std::io::Write;
use crate::skalo::utils::Config;
pub fn create_fasta_and_vcf(
genome_name: String,
mut genome_seq: Vec<u8>,
sample_names: Vec<String>,
variant_map: HashMap<u32, Vec<char>>,
config: &Config,
) {
for base in genome_seq.iter_mut() {
match *base as char {
'A' | 'T' | 'G' | 'C' | 'N' => {} _ => *base = b'N', }
}
let mut sorted_map: Vec<_> = variant_map.into_iter().collect();
sorted_map.sort_by_key(|&(key, _)| key);
let mut sequences: Vec<String> = vec![String::new(); sample_names.len()];
let mut genome_alignments: Option<Vec<String>> = None; let mut vcf_records: Vec<(u32, char, Vec<char>)> = Vec::new();
if !genome_seq.is_empty() {
genome_alignments = Some(vec![String::new(); sample_names.len()]);
}
let mut current_snp_index = 0;
let genome_length = if !genome_seq.is_empty() {
genome_seq.len() as u32
} else {
sorted_map.last().map_or(0, |(pos, _)| *pos + 1)
};
for pos in 0..genome_length {
if current_snp_index < sorted_map.len() && sorted_map[current_snp_index].0 == pos {
let (snp_pos, vec_chars) = &sorted_map[current_snp_index];
if let Some(ref_genome_alignments) = genome_alignments.as_mut() {
let reference_base = genome_seq[*snp_pos as usize] as char;
vcf_records.push((*snp_pos, reference_base, vec_chars.clone()));
for (i, &char) in vec_chars.iter().enumerate() {
ref_genome_alignments[i].push(char);
}
}
for (i, &char) in vec_chars.iter().enumerate() {
sequences[i].push(char);
}
current_snp_index += 1;
} else if let Some(ref_genome_alignments) = genome_alignments.as_mut() {
let ref_base = genome_seq[pos as usize] as char;
for genome_alignment in ref_genome_alignments.iter_mut() {
genome_alignment.push(ref_base);
}
}
}
let snp_filename = format!("{}_snps.fas", config.output_name);
let mut snp_output = File::create(snp_filename).expect("Unable to create SNP file");
for (name, sequence) in sample_names.iter().zip(sequences.iter()) {
writeln!(snp_output, ">{name}").expect("Error writing to SNP file");
writeln!(snp_output, "{sequence}").expect("Error writing to SNP file");
}
if !genome_seq.is_empty() {
let genome_filename = format!("{}_pseudo_genomes.fas", config.output_name);
let genome_output =
File::create(genome_filename).expect("Unable to create genome alignment file");
if let Some(ref_genome_alignments) = genome_alignments {
for (name, alignment) in sample_names.iter().zip(ref_genome_alignments.iter()) {
writeln!(&genome_output, ">{name}")
.expect("Error writing to genome alignment file");
writeln!(&genome_output, "{alignment}")
.expect("Error writing to genome alignment file");
}
}
let vcf_filename = format!("{}_snps.vcf", config.output_name);
let mut vcf_output = File::create(vcf_filename).expect("Unable to create VCF file");
writeln!(vcf_output, "##fileformat=VCFv4.2").expect("Error writing VCF header");
writeln!(
vcf_output,
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}",
sample_names.join("\t")
)
.expect("Error writing VCF header");
for (pos, reference_base, vec_chars) in vcf_records {
let alt_bases: Vec<char> = vec_chars
.iter()
.cloned()
.filter(|&c| c != reference_base && c != '-' && c != 'N')
.collect::<HashSet<_>>() .into_iter()
.collect();
let genotypes: Vec<String> = vec_chars
.iter()
.map(|&c| {
if c == reference_base {
"0".to_string()
} else if c == '-' || c == 'N' {
".".to_string() } else if let Some(alt_index) = alt_bases.iter().position(|&alt| alt == c) {
(alt_index + 1).to_string() } else {
".".to_string() }
})
.collect();
writeln!(
vcf_output,
"{}\t{}\t.\t{}\t{}\t.\t.\t.\tGT\t{}",
genome_name,
pos + 1, reference_base,
alt_bases
.iter()
.map(|&c| c.to_string())
.collect::<Vec<_>>()
.join(","),
genotypes.join("\t")
)
.expect("Error writing VCF record");
}
}
}