use anyhow::Result;
use derive_builder::Builder;
use crate::candidates::hla::alignment;
use crate::candidates::hla::find_variants_from_cigar;
use std::io::Write;
use std::path::PathBuf;
use polars::frame::DataFrame;
use polars::prelude::*;
use rust_htslib::bcf::{header::Header, record::GenotypeAllele, Format, Writer};
#[allow(dead_code)]
#[derive(Builder, Clone)]
pub struct Caller {
genome: PathBuf,
lineages: PathBuf,
output: PathBuf,
threads: String,
output_bcf: bool,
output_bam: bool,
}
impl Caller {
pub fn call(&mut self) -> Result<()> {
alignment(
&"virus",
&self.genome,
&self.lineages,
&self.threads,
false,
&self.output,
)?;
let bam_path: &PathBuf = &self.output.with_file_name("viruses_alignment_sorted.bam");
let (genotype_df, loci_df) = find_variants_from_cigar(&self.genome, &bam_path).unwrap();
write_to_vcf(&self.output, genotype_df, loci_df, self.output_bcf)?;
if !self.output_bam {
std::fs::remove_file(bam_path)?;
}
Ok(())
}
}
pub fn write_to_vcf(
output_path: &PathBuf,
variant_table: DataFrame,
loci_table: DataFrame,
output_bcf: bool,
) -> Result<()> {
let mut header = Header::new();
let first_row_index = variant_table["Index"]
.utf8()
.unwrap()
.into_iter()
.nth(0)
.unwrap()
.unwrap()
.split(',')
.collect::<Vec<&str>>();
let header_contig_line = format!(r#"##contig=<ID={}>"#, first_row_index[0]);
header.push_record(header_contig_line.as_bytes());
let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Variant is present in the haplotype (1) or not (0).">"#;
header.push_record(header_gt_line.as_bytes());
let header_locus_line = r#"##FORMAT=<ID=C,Number=1,Type=Integer,Description="Locus is covered by the haplotype (1) or not (0).">"#;
header.push_record(header_locus_line.as_bytes());
for sample_name in variant_table.get_column_names().iter().skip(2) {
header.push_sample(sample_name.as_bytes());
}
let format = if output_bcf { Format::Bcf } else { Format::Vcf };
let mut writer = Writer::from_path(&output_path, &header, true, format)
.expect("Failed to create VCF/BCF writer");
let _id_iter = variant_table["ID"].i64().unwrap().into_iter();
for row_index in 0..variant_table.height() {
let mut record = writer.empty_record();
let mut variant_iter = variant_table["Index"].utf8().unwrap().into_iter();
let mut id_iter = variant_table["ID"].i64().unwrap().into_iter();
let splitted = variant_iter
.nth(row_index)
.unwrap()
.unwrap()
.split(',')
.collect::<Vec<&str>>();
let chrom = splitted[0];
let pos = splitted[1];
let id = id_iter.nth(row_index).unwrap().unwrap();
let ref_base = splitted[2];
let alt_base = splitted[3];
let alleles: &[&[u8]] = &[ref_base.as_bytes(), alt_base.as_bytes()];
let rid = writer.header().name2rid(chrom.as_bytes()).unwrap();
record.set_rid(Some(rid));
record.set_pos(pos.parse::<i64>().unwrap() - 1);
record.set_id(id.to_string().as_bytes()).unwrap();
record.set_alleles(alleles).expect("Failed to set alleles");
let mut all_gt = Vec::new();
for column_index in 2..variant_table.width() {
let gt = variant_table[column_index]
.i32()
.unwrap()
.into_iter()
.nth(row_index)
.unwrap()
.unwrap();
let gt = GenotypeAllele::Phased(gt);
all_gt.push(gt);
all_gt.push(gt);
}
record.push_genotypes(&all_gt).unwrap();
let mut all_c = Vec::new();
for column_index in 2..loci_table.width() {
let c = loci_table[column_index]
.i32()
.unwrap()
.into_iter()
.nth(row_index)
.unwrap()
.unwrap();
all_c.push(c);
}
record.push_format_integer(b"C", &all_c)?;
writer.write(&record).unwrap();
}
Ok(())
}