use std::cmp;
use std::collections::HashMap;
use std::fs;
use std::fs::File;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::str;
use anyhow::Result;
use askama::Template;
use bio::io::fasta;
use bio_types::genome::AbstractLocus;
use derive_builder::Builder;
use regex::Regex;
use rust_htslib::bam::Read as BamRead;
use rust_htslib::{bam, bcf, bcf::Read};
use crate::errors;
use crate::utils;
use crate::utils::anonymize::Anonymizer;
use crate::variants::model::Variant;
use crate::variants::sample;
use crate::variants::types::breakends::BreakendIndex;
use crate::{cli, reference};
lazy_static! {
static ref TESTCASE_RE: Regex =
Regex::new(r"^(?P<chrom>[^:]+):(?P<pos>\d+)(:(?P<idx>\d+))?$").unwrap();
}
#[derive(Template)]
#[template(path = "testcase.yml", escape = "none")]
struct TestcaseTemplate {
samples: HashMap<String, Sample>,
candidate: String,
scenario: Option<String>,
ref_path: String,
mode: Mode,
purity: Option<f64>,
}
#[derive(Debug)]
struct Sample {
path: String,
properties: String,
options: String,
}
#[derive(Debug, Clone, Copy, EnumString, Display)]
pub enum Mode {
TumorNormal,
Generic,
}
#[derive(Builder)]
#[builder(pattern = "owned")]
pub struct Testcase {
#[builder(setter(into))]
prefix: PathBuf,
#[builder(private)]
chrom_name: Option<Vec<u8>>,
#[builder(private)]
pos: Option<u64>,
#[builder(private)]
idx: usize,
#[builder(private)]
reference_buffer: reference::Buffer,
candidates: PathBuf,
#[builder(private)]
bams: HashMap<String, PathBuf>,
scenario: Option<PathBuf>,
#[builder(private)]
options: HashMap<String, String>,
#[builder(default = "None")]
purity: Option<f64>,
mode: Mode,
anonymize: bool,
}
impl TestcaseBuilder {
pub(crate) fn reference(self, path: impl AsRef<Path> + std::fmt::Debug) -> Result<Self> {
Ok(self.reference_buffer(reference::Buffer::new(
fasta::IndexedReader::from_file(&path)?,
1,
)))
}
pub(crate) fn locus(self, locus: &str) -> Result<Self> {
if locus == "all" {
Ok(self.chrom_name(None).pos(None).idx(0))
} else if let Some(captures) = TESTCASE_RE.captures(locus) {
let chrom_name = captures
.name("chrom")
.unwrap()
.as_str()
.as_bytes()
.to_owned();
let mut pos: u64 = captures.name("pos").unwrap().as_str().parse()?;
pos -= 1;
let idx = if let Some(m) = captures.name("idx") {
let idx: usize = m.as_str().parse()?;
idx - 1
} else {
0
};
Ok(self.chrom_name(Some(chrom_name)).pos(Some(pos)).idx(idx))
} else {
Err(errors::Error::InvalidLocus.into())
}
}
pub(crate) fn register_sample(
mut self,
name: &str,
bam: impl AsRef<Path>,
mut options: cli::Varlociraptor,
) -> Result<Self> {
if self.bams.is_none() {
self = self.bams(HashMap::new());
}
self.bams
.as_mut()
.unwrap()
.insert(name.to_owned(), bam.as_ref().to_owned());
if self.options.is_none() {
self = self.options(HashMap::new());
}
if let cli::Varlociraptor::Preprocess {
kind:
cli::PreprocessKind::Variants {
ref mut reference,
ref mut candidates,
ref mut bam,
ref mut output,
..
},
} = options
{
*reference = "?".into();
*candidates = "?".into();
*bam = "?".into();
*output = Some("?".into());
} else {
unreachable!();
}
self.options
.as_mut()
.unwrap()
.insert(name.to_owned(), serde_json::to_string(&options)?);
Ok(self)
}
}
impl Testcase {
fn candidate_reader(&self) -> Result<bcf::Reader> {
Ok(bcf::Reader::from_path(&self.candidates)?)
}
fn variants(&mut self) -> Result<Vec<bcf::Record>> {
let mut candidate_reader = self.candidate_reader()?;
let rid = if let Some(name) = self.chrom_name.as_ref() {
Some(candidate_reader.header().name2rid(name)?)
} else {
None
};
let mut found = vec![];
for res in candidate_reader.records() {
let rec = res?;
if let Some(rec_rid) = rec.rid() {
if let Some(rid) = rid {
if rec_rid == rid && rec.pos() as u64 == self.pos.unwrap() {
found.push(rec);
break;
}
} else {
found.push(rec);
}
}
}
if found.is_empty() {
Err(errors::Error::NoCandidateFound.into())
} else {
if rid.is_some() {
let mut breakend_index = None;
let mut aux_breakends = Vec::new();
for rec in &mut found {
if utils::is_bnd(rec)? {
if let Some(event) = utils::info_tag_event(rec)? {
if breakend_index.is_none() {
breakend_index = Some(BreakendIndex::new(&self.candidates)?);
}
let breakend_index = breakend_index.as_ref().unwrap();
let last_idx = breakend_index.last_record_index(&event).unwrap();
let mut candidate_reader = self.candidate_reader()?;
for (i, res) in candidate_reader.records().enumerate() {
let mut other_rec = res?;
if let Some(other_event) = utils::info_tag_event(&mut other_rec)? {
if event == other_event
&& (other_rec.contig() != rec.contig()
|| other_rec.pos() != rec.pos())
{
aux_breakends.push(other_rec);
}
}
if i == last_idx {
break;
}
}
} else {
info!("Skipping collection of mate breakends because EVENT tag is not specified, which is unsupported at the moment.")
}
}
}
found.extend(aux_breakends);
}
Ok(found)
}
}
pub(crate) fn write(&mut self) -> Result<()> {
let mut anonymizer = Anonymizer::new();
fs::create_dir_all(&self.prefix)?;
let candidate_filename = Path::new("candidates.vcf");
let mut skips = utils::SimpleCounter::default();
let mut candidate = None;
for (i, mut record) in (self.variants()?).into_iter().enumerate() {
let variants = utils::collect_variants(&mut record, false, Some(&mut skips))?;
for variant in variants {
if i == self.idx {
if self.chrom_name.is_none() {
self.chrom_name = Some(
self.candidate_reader()?
.header()
.rid2name(record.rid().unwrap())
.unwrap()
.to_owned(),
);
self.pos = Some(record.pos() as u64);
}
candidate = Some((variant, record));
break;
}
}
}
if candidate.is_none() {
return Err(errors::Error::InvalidIndex.into());
}
let candidate = candidate.unwrap();
let chrom_name = self.chrom_name.as_ref().unwrap();
let pos = self.pos.unwrap();
let ref_name = str::from_utf8(chrom_name)?;
let (start, end) = match candidate {
(Variant::Deletion(l), _) => (pos.saturating_sub(1000), pos + l as u64 + 1000),
(Variant::Insertion(ref seq), _) => {
(pos.saturating_sub(1000), pos + seq.len() as u64 + 1000)
}
(Variant::Snv(_), _) => (pos.saturating_sub(100), pos + 1 + 100),
(Variant::Mnv(ref bases), _) => {
(pos.saturating_sub(100), pos + bases.len() as u64 + 100)
}
(Variant::Breakend { .. }, _) => {
(pos.saturating_sub(1000), pos + 1 + 1000) }
(Variant::Inversion(l), _) => (pos.saturating_sub(1000), pos + l as u64 + 1000),
(Variant::Duplication(l), _) => (pos.saturating_sub(1000), pos + l as u64 + 1000),
(Variant::Replacement { ref ref_allele, .. }, _) => (
pos.saturating_sub(1000),
pos + ref_allele.len() as u64 + 1000,
),
(Variant::None, _) => (pos.saturating_sub(100), pos + 1 + 100),
};
let mut ref_start = start;
let mut ref_end = end;
for path in self.bams.values() {
let mut bam_reader = bam::IndexedReader::from_path(path)?;
let tid = bam_reader.header().tid(chrom_name).unwrap();
bam_reader.fetch((tid, start, end))?;
for res in bam_reader.records() {
let rec = res?;
let seq_len = rec.seq().len() as u64;
ref_start = cmp::min((rec.pos() as u64).saturating_sub(seq_len), ref_start);
ref_end = cmp::max(rec.cigar().end_pos() as u64 + seq_len, ref_end);
}
}
let mut samples = HashMap::new();
for (name, path) in &self.bams {
let properties = sample::estimate_alignment_properties(
path,
false,
&mut self.reference_buffer,
Some(crate::estimation::alignment_properties::NUM_FRAGMENTS),
0.,
)?;
let mut bam_reader = bam::IndexedReader::from_path(path)?;
let filename = Path::new(name).with_extension("bam");
let header = bam::header::Header::from_template(bam_reader.header());
let mut bam_writer =
bam::Writer::from_path(self.prefix.join(&filename), &header, bam::Format::Bam)?;
let tid = bam_reader.header().tid(chrom_name).unwrap();
bam_reader.fetch((tid, start, end))?;
for res in bam_reader.records() {
let mut rec = res?;
rec.set_pos(rec.pos() - ref_start as i64);
rec.set_mpos(rec.mpos() - ref_start as i64);
rec.set_tid(bam_writer.header().tid(chrom_name).unwrap() as i32);
if rec.remove_aux(b"RG").is_err() {
debug!("No RG tag to remove in BAM record.");
}
if self.anonymize {
anonymizer.anonymize_bam_record(&mut rec);
}
bam_writer.write(&rec)?;
}
samples.insert(
name.to_owned(),
Sample {
path: filename.to_str().unwrap().to_owned(),
properties: serde_json::to_string(&properties)?,
options: self.options.get(name).unwrap().to_owned(),
},
);
}
let mut header = bcf::Header::from_template(self.candidate_reader()?.header());
if self.anonymize {
header.remove_generic(b"varlociraptor_preprocess_args");
}
let mut candidate_writer = bcf::Writer::from_path(
self.prefix.join(candidate_filename),
&header,
true,
bcf::Format::Vcf,
)?;
let (_, mut candidate_record) = candidate;
candidate_record.set_pos(candidate_record.pos() - ref_start as i64);
if let Ok(Some(end)) = candidate_record
.info(b"END")
.integer()
.map(|v| v.map(|v| v[0]))
{
candidate_record.push_info_integer(b"END", &[end - ref_start as i32])?;
}
if self.anonymize {
anonymizer.anonymize_bcf_record(&mut candidate_record)?;
}
candidate_writer.write(&candidate_record)?;
let scenario = if let Some(scenario) = self.scenario.as_ref() {
fs::copy(scenario, self.prefix.join("scenario.yaml"))?;
Some("scenario.yaml".to_owned())
} else {
None
};
for seq in self.reference_buffer.sequences() {
if seq.name == ref_name {
ref_end = cmp::min(ref_end, seq.len);
}
}
let mut ref_seq =
self.reference_buffer.seq(ref_name)?[ref_start as usize..ref_end as usize].to_owned();
if self.anonymize {
anonymizer.anonymize_seq(&mut ref_seq);
}
let ref_filename = "ref.fa";
let mut ref_writer = fasta::Writer::to_file(self.prefix.join(ref_filename))?;
ref_writer.write(ref_name, None, &ref_seq)?;
let mut desc = File::create(self.prefix.join("testcase.yaml"))?;
desc.write_all(
TestcaseTemplate {
samples,
candidate: candidate_filename.to_str().unwrap().to_owned(),
ref_path: ref_filename.to_owned(),
scenario,
mode: self.mode,
purity: self.purity,
}
.render()?
.as_bytes(),
)?;
Ok(())
}
}