use anyhow::Result;
use derive_builder::Builder;
use csv::ReaderBuilder;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::path::PathBuf;
use std::process::{Command, Stdio};
use tempfile::tempdir;
#[derive(Builder, Clone)]
pub struct Caller {
genome: PathBuf,
reads: Option<Vec<PathBuf>>,
bam_input: Option<PathBuf>,
haplotype_variants: PathBuf,
bwa_index: Option<PathBuf>,
vg_index: PathBuf,
output: PathBuf,
threads: String,
output_bam: bool,
omit_homopolymer_artifact_detection: bool,
}
impl Caller {
pub fn call(&self) -> Result<()> {
let outdir = &self.output;
let mut parent = outdir.clone();
parent.pop();
dbg!(&parent);
let _cargo_dir = env!("CARGO_MANIFEST_DIR");
fs::create_dir_all(&parent)?;
let temp_dir = tempdir()?;
let mut linear_genome_index = parent.join("hs_genome");
if let Some(bwa_genome_index) = &self.bwa_index {
linear_genome_index = bwa_genome_index.clone();
println!(
"using input bwa index at: {}",
linear_genome_index.display()
);
} else {
println!("building bwa index at: {}", linear_genome_index.display());
let index = {
Command::new("bwa")
.arg("index")
.arg("-p")
.arg(&linear_genome_index)
.arg("-a")
.arg("bwtsw") .arg(self.genome.clone())
.status()
.expect("failed to execute indexing process")
};
if !index.success() {
panic!(
"bwa index command failed with exit code: {:?}",
index.code(),
);
}
println!("The index was created successfully: {}", index);
println!(
"using input bwa index at: {}",
linear_genome_index.display()
);
}
let mut sample_name = "".to_string();
if let Some(fastq_reads) = &self.reads {
let stem_of_sample_dir = fastq_reads[0].file_stem().unwrap().to_str().unwrap();
let splitted = stem_of_sample_dir.split('_').collect::<Vec<&str>>();
sample_name = splitted[0].to_string();
} else if let Some(bam_input) = &self.bam_input {
let stem_of_sample_dir = bam_input.file_stem().unwrap().to_str().unwrap();
let splitted = stem_of_sample_dir.split('_').collect::<Vec<&str>>();
sample_name = splitted[0].to_string();
} else {
return Err(anyhow::anyhow!("Please provide either fastq reads with --reads or BWA aligned BAM input with --bam-input !"));
}
let file_aligned_sorted = parent.join(format!("{}_sorted.bam", sample_name));
if let Some(bam_input) = &self.bam_input {
println!("Sorting input bam..");
let sort = {
Command::new("samtools")
.arg("sort")
.arg(bam_input)
.arg("-o")
.arg(&file_aligned_sorted)
.arg("-@")
.arg(&self.threads)
.arg("--write-index")
.status()
.expect("failed to execute the sorting process")
};
if !sort.success() {
panic!(
"samtools sort of given bam failed with exit code: {:?}",
sort.code(),
);
}
println!("The sorting was exited with: {}", sort);
println!("{}", file_aligned_sorted.display());
} else {
println!("Aligning provided FASTQ files.");
let file_aligned = temp_dir.path().join(format!("{}.bam", sample_name));
println!("{}", file_aligned.display());
let read_group = format!("@RG\\tID:{}\\tSM:{}", sample_name, sample_name);
let reads = &self.reads.as_ref().unwrap();
let align = {
Command::new("bwa")
.arg("mem")
.arg("-t")
.arg(&self.threads)
.arg("-R")
.arg(&read_group)
.arg(linear_genome_index)
.arg(&reads[0])
.arg(&reads[1])
.arg("-o")
.arg(&file_aligned)
.status()
.expect("failed to execute the alignment process")
};
if !align.success() {
panic!(
"bwa mem alignment failed with exit code: {:?}",
align.code(),
);
}
println!("The alignment was exited with: {}", align);
println!("{}", file_aligned.display());
let sort = {
Command::new("samtools")
.arg("sort")
.arg(file_aligned)
.arg("-o")
.arg(&file_aligned_sorted)
.arg("-@")
.arg(&self.threads)
.arg("--write-index")
.status()
.expect("failed to execute the sorting process")
};
if !sort.success() {
panic!("samtools sort failed with exit code: {:?}", sort.code(),);
}
println!("The sorting was exited with: {}", sort);
println!("{}", file_aligned_sorted.display());
}
let path_idxstats: PathBuf = temp_dir.path().join("stats.txt");
let mut file_idxstats = std::fs::File::create(path_idxstats.clone())?;
let idxstats = {
Command::new("samtools")
.arg("idxstats")
.arg(&file_aligned_sorted)
.output()
.expect("failed to execute idxstat process")
};
file_idxstats.write_all(&idxstats.stdout)?; file_idxstats.flush()?;
let mut chr_naming = &"ensembl";
let mut reader = ReaderBuilder::new()
.delimiter(b'\t')
.from_path(path_idxstats)?;
if let Some(result) = reader.records().next() {
let record = result?;
let record_string = record[0].to_string();
if record_string.starts_with("chr") {
chr_naming = &"ucsc";
}
}
println!("chr_naming format: {}", chr_naming);
let path_to_regions = temp_dir.path().join("regions.bed");
let mut regions_file = std::fs::File::create(&path_to_regions)?;
if chr_naming == &"ucsc" {
let regions_ensembl = "\
chr6\t32659467\t32668383
chr6\t32577902\t32589848
chr6\t32628179\t32647062
chr6\t31268749\t31272130
chr6\t30489509\t30494194
chr6\t29826967\t29831125
chr6\t29722775\t29738528
chr6\t29887752\t29890482
chr6\t29941260\t29949572
chr6\t31353872\t31367067";
regions_file.write_all(regions_ensembl.as_bytes())?;
} else if chr_naming == &"ensembl" {
let regions_ucsc = "\
6\t32659467\t32668383
6\t32577902\t32589848
6\t32628179\t32647062
6\t31268749\t31272130
6\t30489509\t30494194
6\t29826967\t29831125
6\t29722775\t29738528
6\t29887752\t29890482
6\t29941260\t29949572
6\t31353872\t31367067
";
regions_file.write_all(regions_ucsc.as_bytes())?;
}
regions_file.flush()?;
let file_extracted = temp_dir
.path()
.join(format!("{}_extracted.bam", sample_name));
let extract = {
Command::new("samtools")
.arg("view")
.arg(&file_aligned_sorted)
.arg("-L")
.arg(path_to_regions)
.arg("--write-index") .arg("-o")
.arg(&file_extracted)
.status()
.expect("failed to execute the extracting process")
};
if !extract.success() {
panic!(
"extraction of HLA regions from BAM file (samtools view) failed with exit code: {:?}",
extract.code(),
);
}
println!("The extraction was exited with: {}", extract);
let alignment_props_path = temp_dir.path().join(format!("{}.json", sample_name));
println!(
"estimating alignment properties at: {}",
alignment_props_path.display()
);
let varlociraptor_estimate = {
Command::new("varlociraptor")
.arg("estimate")
.arg("alignment-properties")
.arg(&self.genome)
.arg("bams")
.arg(&file_aligned_sorted)
.stdout(Stdio::piped())
.spawn()
.expect("failed to execute the varlociraptor estimate alignment properties")
};
let varlociraptor_estimate_output = varlociraptor_estimate
.wait_with_output()
.expect("Failed to read stdout");
if !varlociraptor_estimate_output.status.success() {
panic!(
"Estimating alignment properties with varlociraptor failed with exit code: {:?}, Error: {}",
varlociraptor_estimate_output.status.code(),
String::from_utf8_lossy(&varlociraptor_estimate_output.stderr)
);
}
let mut alignment_props_file = std::fs::File::create(alignment_props_path.clone())?;
alignment_props_file.write_all(&varlociraptor_estimate_output.stdout)?; alignment_props_file.flush()?;
fs::remove_file(&file_aligned_sorted)?;
let file_aligned_sorted_bai = file_aligned_sorted.with_extension("bam.csi");
if file_aligned_sorted_bai.exists() {
fs::remove_file(&file_aligned_sorted_bai)?;
}
let temp_extracted_fq_1 = temp_dir.path().join(format!("{}_1.fastq", sample_name));
let temp_extracted_fq_2 = temp_dir.path().join(format!("{}_2.fastq", sample_name));
let bam_to_fq = {
Command::new("samtools")
.arg("fastq")
.arg(file_extracted)
.arg("-n") .arg("-1")
.arg(&temp_extracted_fq_1)
.arg("-2")
.arg(&temp_extracted_fq_2)
.status()
.expect("failed to execute the extracting process")
};
if !bam_to_fq.success() {
panic!(
"conversion of bam to fastq (samtools fastq) failed with exit code: {:?}",
bam_to_fq.code(),
);
}
println!("Conversion from BAM to fq was exited with: {}", bam_to_fq);
let file_aligned_pangenome = temp_dir.path().join(format!("{}_vg.bam", sample_name));
let align_pangenome = {
Command::new("vg")
.arg("giraffe")
.arg("-x")
.arg(self.vg_index.clone())
.arg("-f")
.arg(temp_extracted_fq_1)
.arg("-f")
.arg(temp_extracted_fq_2)
.arg("--output-format")
.arg("BAM")
.arg("-t")
.arg(&self.threads)
.stdout(Stdio::piped())
.spawn()
.expect("failed to execute the vg giraffe process")
};
println!(
"Alignment to pangenome was exited with: {:?}",
align_pangenome
);
let output = align_pangenome
.wait_with_output()
.expect("Failed to read stdout");
if !output.status.success() {
panic!(
"vg giraffe failed with exit code: {:?}, Error: {}",
output.status.code(),
String::from_utf8_lossy(&output.stderr)
);
}
let mut vg_bam = std::fs::File::create(file_aligned_pangenome.clone())?;
vg_bam.write_all(&output.stdout)?; vg_bam.flush()?;
let file_vg_aligned_sorted = temp_dir
.path()
.join(format!("{}_vg_sorted.bam", sample_name));
let vg_sort = {
Command::new("samtools")
.arg("sort")
.arg(&file_aligned_pangenome)
.arg("-o")
.arg(&file_vg_aligned_sorted)
.arg("-@")
.arg(&self.threads)
.arg("--write-index")
.status()
.expect("failed to execute the sorting process")
};
if !vg_sort.success() {
panic!(
"samtools sort of vg_aligned bam failed with exit code: {:?}",
vg_sort.code(),
);
}
println!("The sorting was exited with: {}", vg_sort);
println!("{}", file_vg_aligned_sorted.display());
let file_reheadered = temp_dir
.path()
.join(format!("{}_reheadered.bam", sample_name));
println!("{}", file_reheadered.display());
let samtools_view_child = Command::new("samtools")
.arg("view") .arg("-H") .arg(&file_vg_aligned_sorted) .stdout(Stdio::piped())
.spawn()
.unwrap();
let mut regex = &"";
if chr_naming == &"ucsc" {
regex = &"s/GRCh38.//g";
} else if chr_naming == &"ensembl" {
regex = &"s/GRCh38.chr//g";
}
println!("regex for reheader: {}", regex);
let sed_child_one = Command::new("sed")
.arg(regex)
.stdin(Stdio::from(samtools_view_child.stdout.unwrap())) .stdout(Stdio::piped())
.spawn()
.unwrap();
let reheader_child_two = Command::new("samtools")
.arg("reheader")
.arg("-")
.stdin(sed_child_one.stdout.unwrap())
.arg(file_vg_aligned_sorted)
.stdout(Stdio::piped())
.spawn()
.unwrap();
let output = reheader_child_two
.wait_with_output()
.expect("failed to wait on child");
let mut f = std::fs::File::create(file_reheadered.clone())?;
f.write_all(&output.stdout)?;
let samtools_index = {
Command::new("samtools")
.arg("index")
.arg(&file_reheadered)
.status()
.unwrap()
};
if !samtools_index.success() {
panic!(
"Indexing of reheadered bam file failed with exit code: {:?}",
samtools_index.code(),
);
}
println!("The indexing was exited with: {}", samtools_index);
let mut final_bam = temp_dir
.path()
.join(format!("{}_processed.bam", sample_name));
if self.output_bam {
final_bam = parent.join(format!("{}_processed.bam", sample_name));
}
println!("{}", final_bam.display());
let mut chromosomes = vec![];
if chr_naming == &"ucsc" {
chromosomes = vec![
"chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10",
"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19",
"chr20", "chr21", "chr22", "chrX", "chrY", "chrM",
]
} else if chr_naming == &"ensembl" {
chromosomes = vec![
"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "X", "Y", "MT",
]
}
println!("chromosomes to extract: {:?}", chromosomes);
let samtools_extract = {
Command::new("samtools")
.arg("view")
.arg(&file_reheadered)
.args(chromosomes)
.arg("-o")
.arg(&final_bam)
.arg("-@")
.arg(&self.threads)
.arg("--write-index")
.status()
.expect("failed to execute the sorting process")
};
if !samtools_extract.success() {
panic!(
"Extraction of standard chromosomes (samtools view) failed with exit code: {:?}",
samtools_extract.code(),
);
}
println!(
"The extractiong of standard chromosomes was exited with: {}",
samtools_extract
);
let varlociraptor_prep_dir = temp_dir.path().join(format!("{}_obs.bcf", sample_name));
println!(
"varlociraptor_prep_dir: {}",
varlociraptor_prep_dir.display()
);
let varlociraptor_prep = {
Command::new("varlociraptor")
.arg("preprocess")
.arg("variants")
.arg("--report-fragment-ids")
.arg("--omit-mapq-adjustment")
.arg("--atomic-candidate-variants")
.arg("--candidates")
.arg(&self.haplotype_variants)
.arg("--alignment-properties")
.arg(&alignment_props_path)
.arg(&self.genome)
.arg("--bam")
.arg(&final_bam)
.arg("--output")
.arg(&varlociraptor_prep_dir)
.status()
.expect("failed to execute the varlociraptor preprocessing")
};
if !varlociraptor_prep.success() {
panic!(
"Preprocessing of bam file with varlociraptor failed with exit code: {:?}",
varlociraptor_prep.code(),
);
}
println!(
"The varlociraptor preprocessing was exited with: {}",
varlociraptor_prep
);
let varlociraptor_call_dir = outdir;
println!(
"varlociraptor_call_dir: {}",
varlociraptor_call_dir.display()
);
let scenario_str = include_str!("../../resources/scenarios/scenario.yaml");
let scenario_path = temp_dir.path().join("scenario.yaml");
let mut scenario_file = File::create(&scenario_path)?;
scenario_file.write_all(scenario_str.as_bytes())?;
println!("YAML written to scenario.yaml in temp dir");
println!(
"{}",
format!("sample={}", &varlociraptor_prep_dir.display())
);
let mut cmd: Command = Command::new("varlociraptor");
cmd.arg("call").arg("variants");
if self.omit_homopolymer_artifact_detection {
cmd.arg("--omit-homopolymer-artifact-detection");
}
cmd.arg("generic")
.arg("--obs")
.arg(format!("sample={}", varlociraptor_prep_dir.display()))
.arg("--scenario")
.arg(&scenario_path);
let varlociraptor_call = cmd
.stdout(Stdio::piped())
.spawn()
.expect("failed to execute the varlociraptor calling process");
println!(
"The varlociraptor calling was exited with: {:?}",
varlociraptor_call
);
let var_output = varlociraptor_call
.wait_with_output()
.expect("Varlociraptor: Failed to read stdout");
if !var_output.status.success() {
panic!(
"calling of observation file with varlociraptor failed with exit code: {:?}, Error: {}",
var_output.status.code(),
String::from_utf8_lossy(&var_output.stderr)
);
}
let mut called_file = std::fs::File::create(&varlociraptor_call_dir)?;
called_file.write_all(&var_output.stdout)?; called_file.flush()?;
temp_dir.close()?;
Ok(())
}
}