use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::fs::OpenOptions;
use std::io::{BufReader, Write};
use std::path::Path;
use std::str::FromStr;
use annonars::common::cli::CANONICAL;
use anyhow::Error;
use bio::data_structures::interval_tree::IntervalTree;
use chrono::Utc;
use clap::Parser;
use flate2::Compression;
use flate2::write::GzEncoder;
use futures::TryStreamExt;
use noodles::core::Position;
use noodles::vcf::Header as VcfHeader;
use noodles::vcf::io::reader::Builder as VariantReaderBuilder;
use noodles::vcf::variant::RecordBuf as VcfRecord;
use noodles::vcf::variant::record::AlternateBases as _;
use noodles::vcf::variant::record::info::field::key::{END_POSITION, SV_TYPE};
use noodles::vcf::variant::record::samples::keys::key::{
CONDITIONAL_GENOTYPE_QUALITY, FILTER, GENOTYPE, GENOTYPE_COPY_NUMBER,
};
use noodles::vcf::variant::record_buf::AlternateBases;
use noodles::vcf::variant::record_buf::Samples;
use noodles::vcf::variant::record_buf::info::field;
use noodles::vcf::variant::record_buf::samples::Keys;
use noodles::vcf::variant::record_buf::samples::sample;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use serde::{Deserialize, Serialize};
use strum::{Display, EnumIter, IntoEnumIterator};
use uuid::Uuid;
use crate::common::TsvContigStyle;
use crate::common::noodles::{NoodlesVariantReader, open_variant_reader, open_variant_writer};
use crate::ped::PedigreeByName;
use super::seqvars::{AsyncAnnotatedVariantWriter, binning};
use self::bnd::Breakend;
pub mod csq;
mod maelstrom;
#[derive(Debug, Clone, clap::ValueEnum, PartialEq, Eq)]
pub enum OutputFormat {
Vcf,
Tsv,
}
#[derive(Parser, Debug, Clone)]
#[command(about = "Annotate structural variant VCF files", long_about = None)]
pub struct Args {
#[arg(long, required = true)]
pub assembly: String,
#[arg(long)]
pub path_input_ped: String,
#[arg(long, required = true)]
pub path_input_vcf: Vec<String>,
#[arg(short = 'o', long, required = true)]
pub output: String,
#[arg(long, value_enum, default_value_t = OutputFormat::Vcf)]
pub output_format: OutputFormat,
#[arg(long)]
pub max_var_count: Option<usize>,
#[arg(long)]
pub path_cov_vcf: Vec<String>,
#[arg(long, default_value_t = 0.8)]
pub min_overlap: f32,
#[arg(long, default_value_t = 50)]
pub slack_bnd: i32,
#[arg(long, default_value_t = 50)]
pub slack_ins: i32,
#[arg(long)]
pub rng_seed: Option<u64>,
#[arg(long)]
pub file_date: Option<String>,
#[arg(long, value_enum, default_value_t=TsvContigStyle::Auto)]
pub tsv_contig_style: TsvContigStyle,
}
pub mod vcf_header {
use annonars::common::cli::is_canonical;
use noodles::vcf::header;
use noodles::vcf::header::record::value::Map;
use noodles::vcf::header::record::value::map::format::Number as FormatNumber;
use noodles::vcf::header::record::value::map::info::Number;
use noodles::vcf::header::record::value::map::{Contig, Filter, Format, Info, Other};
use noodles::vcf::variant::record::info::field::key::{
END_CONFIDENCE_INTERVALS, END_POSITION, POSITION_CONFIDENCE_INTERVALS, SV_TYPE,
};
use std::collections::HashMap;
use crate::common::contig::ContigManager;
use crate::ped::{Disease, PedigreeByName, Sex};
use noodles::vcf::variant::record::samples::keys::key::{
CONDITIONAL_GENOTYPE_QUALITY, FILTER, GENOTYPE, GENOTYPE_COPY_NUMBER,
};
use noodles::vcf::{
Header,
header::{Builder, FileFormat},
};
pub(crate) const FILE_FORMAT_MAJOR: u32 = 4;
pub(crate) const FILE_FORMAT_MINOR: u32 = 3;
pub(crate) const SOURCE: &str = "mehari";
pub fn build(
contig_manager: &ContigManager,
pedigree: &PedigreeByName,
date: &str,
header: &Header,
) -> Result<Header, anyhow::Error> {
let builder = add_meta_leading(Header::builder(), date)?;
let builder = add_meta_contigs(builder, contig_manager)?;
let builder = add_meta_alt(builder)?;
let builder = add_meta_info(builder)?;
let builder = add_meta_filter(builder)?;
let builder = add_meta_format(builder)?;
let builder = add_meta_pedigree(builder, pedigree, header)?;
Ok(builder.build())
}
fn add_meta_leading(builder: Builder, date: &str) -> Result<Builder, anyhow::Error> {
Ok(builder
.set_file_format(FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR))
.insert("fileDate".parse()?, header::record::Value::from(date))?
.insert("source".parse()?, header::record::Value::from(SOURCE))?)
}
fn add_meta_contigs(
builder: Builder,
contig_manager: &ContigManager,
) -> Result<Builder, anyhow::Error> {
let mut builder = builder;
for sequence in contig_manager.sequences() {
if is_canonical(sequence.name.as_ref()) {
let contig = Map::<Contig>::builder()
.set_length(sequence.length)
.insert("assembly".parse()?, contig_manager.assembly().to_string())
.insert("accession".parse()?, sequence.refseq_ac.clone())
.build()?;
builder = builder.add_contig(sequence.name.clone(), contig);
}
}
Ok(builder)
}
fn add_meta_alt(builder: Builder) -> Result<Builder, anyhow::Error> {
use noodles::vcf::header::record::value::map::AlternativeAllele;
let del_id = "DEL";
let del_alt = Map::<AlternativeAllele>::new("Deletion");
let del_me_id = "DEL:ME";
let del_me_alt = Map::<AlternativeAllele>::new("Deletion of mobile element");
let ins_id = "INS";
let ins_alt = Map::<AlternativeAllele>::new("Insertion");
let ins_me_id = "INS:ME";
let ins_me_alt = Map::<AlternativeAllele>::new("Insertion of mobile element");
let dup_id = "DUP";
let dup_alt = Map::<AlternativeAllele>::new("Duplication");
let dup_tnd_id = "DUP:TANDEM";
let dup_tnd_alt = Map::<AlternativeAllele>::new("Tandem Duplication");
let dup_dsp_id = "DUP:DISPERSED";
let dup_dsp_alt = Map::<AlternativeAllele>::new("Dispersed Duplication");
let cnv_id = "CNV";
let cnv_alt = Map::<AlternativeAllele>::new("Copy number variation");
let bnd_id = "BND";
let bnd_alt = Map::<AlternativeAllele>::new("Breakend");
Ok(builder
.add_alternative_allele(del_id, del_alt)
.add_alternative_allele(del_me_id, del_me_alt)
.add_alternative_allele(ins_id, ins_alt)
.add_alternative_allele(ins_me_id, ins_me_alt)
.add_alternative_allele(dup_id, dup_alt)
.add_alternative_allele(dup_tnd_id, dup_tnd_alt)
.add_alternative_allele(dup_dsp_id, dup_dsp_alt)
.add_alternative_allele(cnv_id, cnv_alt)
.add_alternative_allele(bnd_id, bnd_alt))
}
fn add_meta_info(builder: Builder) -> Result<Builder, anyhow::Error> {
use header::record::value::map::info::Type;
Ok(builder
.add_info(END_POSITION, Map::<Info>::from(END_POSITION))
.add_info(
POSITION_CONFIDENCE_INTERVALS,
Map::<Info>::from(POSITION_CONFIDENCE_INTERVALS),
)
.add_info(
END_CONFIDENCE_INTERVALS,
Map::<Info>::from(END_CONFIDENCE_INTERVALS),
)
.add_info(
"CARRIERS_HET",
Map::<Info>::new(
Number::Count(1),
Type::Integer,
"Number of heterozygous carriers",
),
)
.add_info(
"CARRIERS_HOM_REF",
Map::<Info>::new(
Number::Count(1),
Type::Integer,
"Number of homozygous reference carriers",
),
)
.add_info(
"CARRIERS_HOM_ALT",
Map::<Info>::new(
Number::Count(1),
Type::Integer,
"Number of homozygous alternative carriers",
),
)
.add_info(
"CARRIERS_HEMI_REF",
Map::<Info>::new(
Number::Count(1),
Type::Integer,
"Number of hemizygous reference carriers",
),
)
.add_info(
"CARRIERS_HEMI_ALT",
Map::<Info>::new(
Number::Count(1),
Type::Integer,
"Number of hemizygous alternative carriers",
),
)
.add_info(
"callers",
Map::<Info>::new(
Number::Unknown,
Type::String,
"Callers that detected the variant",
),
)
.add_info(
"sv_uuid",
Map::<Info>::new(
Number::Unknown,
Type::String,
"Temporary UUID; needed for TSV output",
),
)
.add_info(SV_TYPE, Map::<Info>::from(SV_TYPE)))
}
fn add_meta_filter(builder: Builder) -> Result<Builder, anyhow::Error> {
Ok(builder.add_filter("PASS", Map::<Filter>::new("All filters passed")))
}
fn add_meta_format(builder: Builder) -> Result<Builder, anyhow::Error> {
use header::record::value::map::format::Type;
Ok(builder
.add_format(GENOTYPE, Map::<Format>::from(GENOTYPE))
.add_format(FILTER, Map::<Format>::from(FILTER))
.add_format(
CONDITIONAL_GENOTYPE_QUALITY,
Map::<Format>::from(CONDITIONAL_GENOTYPE_QUALITY),
)
.add_format(
"pec",
Map::<Format>::new(FormatNumber::Count(1), Type::Integer, "Paired-end coverage"),
)
.add_format(
"pev",
Map::<Format>::new(
FormatNumber::Count(1),
Type::Integer,
"Paired-end variant support",
),
)
.add_format(
"src",
Map::<Format>::new(FormatNumber::Count(1), Type::Integer, "Split-end coverage"),
)
.add_format(
"srv",
Map::<Format>::new(
FormatNumber::Count(1),
Type::Integer,
"Split-end variant support",
),
)
.add_format(
"amq",
Map::<Format>::new(
FormatNumber::Count(1),
Type::Integer,
"Average mapping quality",
),
)
.add_format(
GENOTYPE_COPY_NUMBER,
Map::<Format>::from(GENOTYPE_COPY_NUMBER),
)
.add_format(
"anc",
Map::<Format>::new(
FormatNumber::Count(1),
Type::Integer,
"Average normalized coverage",
),
)
.add_format(
"pc",
Map::<Format>::new(
FormatNumber::Count(1),
Type::Integer,
"Point count (windows/targets/probes)",
),
))
}
pub fn sex_str(sex: Sex) -> String {
match sex {
Sex::Male => String::from("Male"),
Sex::Female => String::from("Female"),
Sex::Unknown => String::from("Unknown"),
}
}
pub fn disease_str(disease: Disease) -> String {
match disease {
Disease::Affected => String::from("Affected"),
Disease::Unaffected => String::from("Unaffected"),
Disease::Unknown => String::from("Unknown"),
}
}
fn add_meta_pedigree(
builder: Builder,
pedigree: &PedigreeByName,
header: &Header,
) -> Result<Builder, anyhow::Error> {
let mut builder = add_meta_fields(builder)?;
let individuals = pedigree
.individuals
.values()
.map(|i| (i.name.clone(), i))
.collect::<HashMap<_, _>>();
for sample in header.sample_names() {
let individual = individuals.get(sample);
if individual.is_none() {
tracing::debug!("Sample {} not part of the pedigree, skipping.", sample);
continue;
}
let i = individual.unwrap();
builder = builder.add_sample_name(i.name.clone());
builder = builder.insert(
"SAMPLE".parse()?,
noodles::vcf::header::record::Value::Map(
i.name.clone(),
Map::<Other>::builder()
.insert("Sex".parse()?, sex_str(i.sex))
.insert("Disease".parse()?, disease_str(i.disease))
.build()?,
),
)?;
let mut map_builder = Map::<Other>::builder();
if let Some(father) = i.father.as_ref() {
map_builder = map_builder.insert("Father".parse()?, father.clone());
}
if let Some(mother) = i.mother.as_ref() {
map_builder = map_builder.insert("Mother".parse()?, mother.clone());
}
builder = builder.insert(
"PEDIGREE".parse()?,
noodles::vcf::header::record::Value::Map(i.name.clone(), map_builder.build()?),
)?;
}
Ok(builder)
}
fn add_meta_fields(builder: Builder) -> Result<Builder, anyhow::Error> {
Ok(builder
.insert(
"META".parse()?,
noodles::vcf::header::record::Value::Map(
String::from("Sex"),
Map::<Other>::builder()
.insert("Type".parse()?, "String")
.insert("Number".parse()?, ".")
.insert("Values".parse()?, "[Male, Female, Other, Unknown]")
.build()?,
),
)?
.insert(
"META".parse()?,
noodles::vcf::header::record::Value::Map(
String::from("GonosomalKaryotype"),
Map::<Other>::builder()
.insert("Type".parse()?, "String")
.insert("Number".parse()?, ".")
.insert(
"Values".parse()?,
"[XX, XY, XO, XXY, XXX, XYY, Other, Unknown]",
)
.build()?,
),
)?
.insert(
"META".parse()?,
noodles::vcf::header::record::Value::Map(
String::from("Affected"),
Map::<Other>::builder()
.insert("Type".parse()?, "String")
.insert("Number".parse()?, ".")
.insert("Values".parse()?, "[Yes, No, Unknown]")
.build()?,
),
)?)
}
}
use crate::common::contig::ContigManager;
struct VarFishStrucvarTsvWriter {
inner: Box<dyn Write>,
}
#[derive(Debug, Default, Serialize, Deserialize, PartialEq, Clone)]
pub struct GenotypeInfo {
pub name: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub gt: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ft: Option<Vec<String>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub gq: Option<i32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pec: Option<i32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pev: Option<i32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub src: Option<i32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub srv: Option<i32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub amq: Option<i32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cn: Option<f32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub anc: Option<f32>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub pc: Option<i32>,
}
#[derive(Debug, Default, PartialEq, Serialize, Deserialize, Clone)]
pub struct GenotypeCalls {
pub entries: Vec<GenotypeInfo>,
}
impl GenotypeCalls {
pub fn for_tsv(&self) -> String {
let mut result = String::new();
result.push('{');
let mut first = true;
for entry in &self.entries {
if first {
first = false;
} else {
result.push(',');
}
result.push_str(&format!("\"\"\"{}\"\"\":{{", entry.name));
let mut prev = false;
if let Some(gt) = &entry.gt {
prev = true;
result.push_str(&format!("\"\"\"gt\"\"\":\"\"\"{}\"\"\"", gt));
}
if let Some(ft) = &entry.ft {
if prev {
result.push(',');
}
prev = true;
result.push_str("\"\"\"ft\"\"\":[");
let mut first_ft = true;
for ft_entry in ft {
if first_ft {
first_ft = false;
} else {
result.push(',');
}
result.push_str(&format!("\"\"\"{}\"\"\"", ft_entry));
}
result.push(']');
}
if let Some(gq) = &entry.gq {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"gq\"\"\":{}", gq));
}
if let Some(pec) = &entry.pec {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"pec\"\"\":{}", pec));
}
if let Some(pev) = &entry.pev {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"pev\"\"\":{}", pev));
}
if let Some(src) = &entry.src {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"src\"\"\":{}", src));
}
if let Some(srv) = &entry.srv {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"srv\"\"\":{}", srv));
}
if let Some(amq) = &entry.amq {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"amq\"\"\":{}", amq));
}
if let Some(cn) = &entry.cn {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"cn\"\"\":{}", cn));
}
if let Some(anc) = &entry.anc {
if prev {
result.push(',');
}
prev = true;
result.push_str(&format!("\"\"\"anc\"\"\":{}", anc));
}
if let Some(pc) = &entry.pc {
if prev {
result.push(',');
}
result.push_str(&format!("\"\"\"pc\"\"\":{}", pc));
}
result.push('}');
}
result.push('}');
result
}
}
impl VarFishStrucvarTsvWriter {
pub fn with_path<P>(p: P) -> anyhow::Result<Self>
where
P: AsRef<Path>,
{
Ok(Self {
inner: if p.as_ref().extension().unwrap_or_default() == "gz" {
Box::new(GzEncoder::new(File::create(p)?, Compression::default()))
} else {
Box::new(File::create(p)?)
},
})
}
pub fn write_header(&mut self) -> Result<(), Error> {
let header = &[
"release",
"chromosome",
"chromosome_no",
"bin",
"chromosome2",
"chromosome_no2",
"bin2",
"pe_orientation",
"start",
"end",
"start_ci_left",
"start_ci_right",
"end_ci_left",
"end_ci_right",
"case_id",
"set_id",
"sv_uuid",
"caller",
"sv_type",
"sv_sub_type",
"info",
"num_hom_alt",
"num_hom_ref",
"num_het",
"num_hemi_alt",
"num_hemi_ref",
"genotype",
];
writeln!(self.inner, "{}", header.join("\t"))
.map_err(|e| anyhow::anyhow!("Error writing VarFish TSV header: {}", e))
}
pub fn write_record(&mut self, tsv_record: &VarFishStrucvarTsvRecord) -> Result<(), Error> {
writeln!(
self.inner,
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{{}}\t{}\t{}\t{}\t{}\t{}\t{}",
&tsv_record.release,
&tsv_record.chromosome,
&tsv_record.chromosome_no,
&tsv_record.bin,
&tsv_record.chromosome2,
&tsv_record.chromosome_no2,
&tsv_record.bin2,
&tsv_record.pe_orientation,
&tsv_record.start,
&tsv_record.end,
&tsv_record.start_ci_left,
&tsv_record.start_ci_right,
&tsv_record.end_ci_left,
&tsv_record.end_ci_right,
&tsv_record.case_id,
&tsv_record.set_id,
&tsv_record.sv_uuid,
tsv_record.callers.join(";"),
&tsv_record.sv_type,
&tsv_record.sv_sub_type,
tsv_record.num_hom_alt,
tsv_record.num_hom_ref,
tsv_record.num_het,
tsv_record.num_hemi_alt,
tsv_record.num_hemi_ref,
&tsv_record.genotype.for_tsv(),
).map_err(|e| anyhow::anyhow!("Error writing VarFish TSV record: {}", e))
}
pub fn flush(&mut self) -> Result<(), anyhow::Error> {
self.inner.flush()?;
Ok(())
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display, Default, Serialize, Deserialize)]
pub enum PeOrientation {
#[strum(serialize = "3to3")]
ThreeToThree,
#[strum(serialize = "5to5")]
FiveToFive,
#[strum(serialize = "3to5")]
ThreeToFive,
#[strum(serialize = "5to3")]
FiveToThree,
#[strum(serialize = "NtoN")]
#[default]
Other,
}
impl From<SvType> for PeOrientation {
fn from(sv_type: SvType) -> Self {
match sv_type {
SvType::Del => Self::ThreeToFive,
SvType::Dup => Self::ThreeToThree,
SvType::Inv => Self::FiveToFive,
SvType::Ins | SvType::Cnv | SvType::Bnd => Self::Other,
}
}
}
impl FromStr for PeOrientation {
type Err = crate::errors::StrucvarsError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"3to3" => Ok(PeOrientation::ThreeToThree),
"5to5" => Ok(PeOrientation::FiveToFive),
"3to5" => Ok(PeOrientation::ThreeToFive),
"5to3" => Ok(PeOrientation::FiveToThree),
"NtoN" => Ok(PeOrientation::Other),
_ => Err(crate::errors::StrucvarsError::InvalidPeOrientation(
s.to_string(),
)),
}
}
}
#[derive(
EnumIter,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Debug,
Clone,
Copy,
Default,
Display,
Serialize,
Deserialize,
)]
pub enum SvType {
#[strum(serialize = "DEL")]
#[default]
Del,
#[strum(serialize = "DUP")]
Dup,
#[strum(serialize = "INV")]
Inv,
#[strum(serialize = "INS")]
Ins,
#[strum(serialize = "BND")]
Bnd,
#[strum(serialize = "CNV")]
Cnv,
}
impl FromStr for SvType {
type Err = crate::errors::StrucvarsError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"DEL" => Ok(SvType::Del),
"DUP" => Ok(SvType::Dup),
"INV" => Ok(SvType::Inv),
"INS" => Ok(SvType::Ins),
"BND" => Ok(SvType::Bnd),
"CNV" => Ok(SvType::Cnv),
_ => Err(crate::errors::StrucvarsError::InvalidSvType(s.to_string())),
}
}
}
#[derive(
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Debug,
Clone,
Copy,
Default,
Display,
Serialize,
Deserialize,
)]
pub enum SvSubType {
#[strum(serialize = "DEL")]
#[default]
Del,
#[strum(serialize = "DEL:ME")]
DelMe,
#[strum(serialize = "DEL:ME:SVA")]
DelMeSva,
#[strum(serialize = "DEL:ME:L1")]
DelMeL1,
#[strum(serialize = "DEL:ME:ALU")]
DelMeAlu,
#[strum(serialize = "DUP")]
Dup,
#[strum(serialize = "DUP:TANDEM")]
DupTandem,
#[strum(serialize = "INV")]
Inv,
#[strum(serialize = "INS")]
Ins,
#[strum(serialize = "INS:ME")]
InsMe,
#[strum(serialize = "INS:ME:SVA")]
InsMeSva,
#[strum(serialize = "INS:ME:L1")]
InsMeL1,
#[strum(serialize = "INS:ME:ALU")]
InsMeAlu,
#[strum(serialize = "BND")]
Bnd,
#[strum(serialize = "CNV")]
Cnv,
}
impl From<SvSubType> for SvType {
fn from(other: SvSubType) -> SvType {
match other {
SvSubType::Del
| SvSubType::DelMe
| SvSubType::DelMeSva
| SvSubType::DelMeL1
| SvSubType::DelMeAlu => SvType::Del,
SvSubType::Dup | SvSubType::DupTandem => SvType::Dup,
SvSubType::Inv => SvType::Inv,
SvSubType::Ins
| SvSubType::InsMe
| SvSubType::InsMeSva
| SvSubType::InsMeL1
| SvSubType::InsMeAlu => SvType::Ins,
SvSubType::Bnd => SvType::Bnd,
SvSubType::Cnv => SvType::Cnv,
}
}
}
impl FromStr for SvSubType {
type Err = crate::errors::StrucvarsError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"DEL" => Ok(SvSubType::Del),
"DEL:ME" => Ok(SvSubType::DelMe),
"DEL:ME:SVA" => Ok(SvSubType::DelMeSva),
"DEL:ME:L1" => Ok(SvSubType::DelMeL1),
"DEL:ME:LINE1" => Ok(SvSubType::DelMeL1),
"DEL:ME:ALU" => Ok(SvSubType::DelMeAlu),
"DUP" => Ok(SvSubType::Dup),
"DUP:TANDEM" => Ok(SvSubType::DupTandem),
"INV" => Ok(SvSubType::Inv),
"INS" => Ok(SvSubType::Ins),
"INS:ME" => Ok(SvSubType::InsMe),
"INS:ME:SVA" => Ok(SvSubType::InsMeSva),
"INS:ME:L1" => Ok(SvSubType::InsMeL1),
"INS:ME:LINE1" => Ok(SvSubType::InsMeL1),
"INS:ME:ALU" => Ok(SvSubType::InsMeAlu),
"BND" => Ok(SvSubType::Bnd),
"CNV" => Ok(SvSubType::Cnv),
"ALU" => Ok(SvSubType::InsMeAlu),
"SVA" => Ok(SvSubType::InsMeSva),
"LINE1" => Ok(SvSubType::InsMeL1),
_ => Err(crate::errors::StrucvarsError::InvalidSvSubType(
s.to_string(),
)),
}
}
}
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub struct InfoRecord {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub alt: Option<String>,
}
#[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq)]
pub struct VarFishStrucvarTsvRecord {
pub release: String,
pub chromosome: String,
pub chromosome_no: u32,
pub bin: u32,
pub chromosome2: String,
pub chromosome_no2: u32,
pub bin2: u32,
pub pe_orientation: PeOrientation,
pub start: i32,
pub end: i32,
pub start_ci_left: i32,
pub start_ci_right: i32,
pub end_ci_left: i32,
pub end_ci_right: i32,
pub case_id: usize,
pub set_id: usize,
pub sv_uuid: Uuid,
pub callers: Vec<String>,
pub sv_type: SvType,
pub sv_sub_type: SvSubType,
pub info: InfoRecord,
pub num_hom_alt: i32,
pub num_hom_ref: i32,
pub num_het: i32,
pub num_hemi_alt: i32,
pub num_hemi_ref: i32,
pub genotype: GenotypeCalls,
}
impl VarFishStrucvarTsvRecord {
pub fn overlap(&self, other: &Self) -> f32 {
let s1 = if self.start > 0 { self.start - 1 } else { 0 };
let e1 = self.end + 1;
let s2 = if other.start > 0 { other.start - 1 } else { 0 };
let e2 = other.end;
let ovl_s = std::cmp::max(s1, s2);
let ovl_e = std::cmp::min(e1, e2);
if ovl_e <= ovl_s {
0.0
} else {
let len1 = (e1 - s1) as f32;
let len2 = (e2 - s2) as f32;
let ovl_len = (ovl_e - ovl_s) as f32;
(ovl_len / len1).min(ovl_len / len2)
}
}
fn merge(&mut self, other: &VarFishStrucvarTsvRecord) {
assert_eq!(self.sv_type, other.sv_type);
assert_eq!(self.genotype.entries.len(), other.genotype.entries.len());
self.callers.extend_from_slice(&other.callers);
self.callers.sort();
self.callers.dedup();
if other.num_hom_alt + other.num_het + other.num_hemi_alt
> self.num_hom_alt + self.num_het + self.num_hemi_alt
{
self.num_hom_alt = other.num_hom_alt;
self.num_hom_ref = other.num_hom_ref;
self.num_het = other.num_het;
self.num_hemi_alt = other.num_hemi_alt;
self.num_hemi_ref = other.num_hemi_ref;
}
for i in 0..self.genotype.entries.len() {
let lhs = self
.genotype
.entries
.get_mut(i)
.expect("vecs have same size");
let rhs = other.genotype.entries.get(i).expect("vecs have same size");
if let Some(lhs_gq) = lhs.gq {
if let Some(rhs_gq) = rhs.gq
&& lhs_gq < rhs_gq
{
lhs.gq = rhs.gq;
}
} else {
lhs.gq = rhs.gq;
}
if let (Some(_), Some(lhs_pev)) = (lhs.pec, lhs.pev) {
if let (Some(_), Some(rhs_pev)) = (rhs.pec, rhs.pev)
&& lhs_pev < rhs_pev
{
lhs.pec = rhs.pec;
lhs.pev = rhs.pev;
}
} else {
lhs.pec = rhs.pec;
lhs.pev = rhs.pev;
}
if let (Some(_), Some(lhs_srv)) = (lhs.src, lhs.srv) {
if let (Some(_), Some(rhs_srv)) = (rhs.src, rhs.srv)
&& lhs_srv < rhs_srv
{
lhs.src = rhs.src;
lhs.srv = rhs.srv;
}
} else {
lhs.src = rhs.src;
lhs.srv = rhs.srv;
}
}
}
}
impl TryInto<VcfRecord> for VarFishStrucvarTsvRecord {
type Error = anyhow::Error;
fn try_into(self) -> Result<VcfRecord, Self::Error> {
use noodles::vcf::variant::record_buf::Info;
use noodles::vcf::variant::record_buf::info::field::Value;
use noodles::vcf::variant::record_buf::info::field::value::Array;
let mut genotypes = Vec::new();
for genotype in &self.genotype.entries {
genotypes.push(vec![
genotype
.gt
.as_ref()
.map(|gt| sample::Value::String(gt.clone())),
genotype.ft.as_ref().map(|ft| {
sample::Value::Array(sample::value::Array::String(
ft.iter().map(|s| Some(s.clone())).collect(),
))
}),
genotype.gq.as_ref().map(|gq| sample::Value::Integer(*gq)),
genotype
.pec
.as_ref()
.map(|pec| sample::Value::Integer(*pec)),
genotype
.pev
.as_ref()
.map(|pev| sample::Value::Integer(*pev)),
genotype
.src
.as_ref()
.map(|src| sample::Value::Integer(*src)),
genotype
.srv
.as_ref()
.map(|srv| sample::Value::Integer(*srv)),
genotype
.amq
.as_ref()
.map(|amq| sample::Value::Integer(*amq)),
genotype.cn.as_ref().map(|cn| sample::Value::Float(*cn)),
genotype.anc.as_ref().map(|anc| sample::Value::Float(*anc)),
genotype.pc.as_ref().map(|pc| sample::Value::Integer(*pc)),
]);
}
let keys: Keys = [
GENOTYPE,
FILTER,
CONDITIONAL_GENOTYPE_QUALITY,
"pec",
"pev",
"src",
"srv",
"amq",
GENOTYPE_COPY_NUMBER,
"anc",
"pc",
]
.into_iter()
.map(String::from)
.collect();
let samples = Samples::new(keys, genotypes);
let info = vec![
("END".to_string(), Some(Value::Integer(self.end))),
(
"sv_uuid".to_string(),
Some(Value::String(format!("{}", self.sv_uuid))),
),
(
"callers".to_string(),
Some(Value::Array(Array::String(
self.callers.iter().map(|s| Some(s.to_string())).collect(),
))),
),
(
"SVTYPE".to_string(),
Some(Value::String(format!("{}", self.sv_sub_type))),
),
];
let mut info: Info = info.into_iter().collect();
info.insert("CARRIERS_HET".into(), Some(Value::Integer(self.num_het)));
info.insert(
"CARRIERS_HOM_REF".into(),
Some(Value::Integer(self.num_hom_ref)),
);
info.insert(
"CARRIERS_HOM_ALT".into(),
Some(Value::Integer(self.num_hom_alt)),
);
info.insert(
"CARRIERS_HEMI_REF".into(),
Some(Value::Integer(self.num_hemi_ref)),
);
info.insert(
"CARRIERS_HEMI_ALT".into(),
Some(Value::Integer(self.num_hemi_alt)),
);
let builder = noodles::vcf::variant::RecordBuf::builder()
.set_reference_sequence_name(self.chromosome.clone())
.set_variant_start(Position::try_from(self.start as usize)?)
.set_reference_bases("N")
.set_info(info)
.set_samples(samples);
let builder = if self.sv_sub_type == SvSubType::Bnd {
builder.set_alternate_bases(AlternateBases::from(vec![self.info.alt.unwrap()]))
} else {
builder.set_alternate_bases(AlternateBases::from(vec![format!(
"<{}>",
self.sv_sub_type
)]))
};
let res = builder.build();
Ok(res)
}
}
#[derive(Debug, Clone, PartialEq, EnumIter, Serialize, Deserialize)]
pub enum SvCaller {
ClinCnv { version: String },
Delly { version: String },
DragenSv { version: String },
DragenCnv { version: String },
Gcnv { version: String },
Manta { version: String },
Melt { version: String },
Popdel { version: String },
Sniffles2 { version: String },
}
impl SvCaller {
pub fn caller_compatible(&self, header: &VcfHeader) -> bool {
match self {
SvCaller::ClinCnv { .. } => {
self.all_format_defined(header, &["GT", "CN", "GQ", "NP"])
&& self.all_info_defined(header, &["END", "SVTYPE", "SVLEN"])
}
SvCaller::Delly { .. } => {
self.all_format_defined(header, &["RC", "RCL", "RCR"])
&& self.all_info_defined(header, &["CHR2", "INSLEN", "HOMLEN"])
}
SvCaller::DragenSv { .. } => {
self.all_format_defined(header, &["PL", "PR", "SR"])
&& self.all_info_defined(header, &["BND_DEPTH", "MATE_BND_DEPTH"])
&& self.source_starts_with(header, "DRAGEN")
}
SvCaller::Manta { .. } => {
self.all_format_defined(header, &["PL", "PR", "SR"])
&& self.all_info_defined(header, &["BND_DEPTH", "MATE_BND_DEPTH"])
&& self.source_starts_with(header, "GenerateSVCandidates")
}
SvCaller::Melt { .. } => {
self.all_format_defined(header, &["GL", "DP", "AD"])
&& self.all_info_defined(header, &["ASSESS", "TSD"])
&& self.source_starts_with(header, "MELTv")
}
SvCaller::DragenCnv { .. } => {
self.all_format_defined(header, &["SM", "CN", "BC", "PE"])
&& self.all_info_defined(header, &["REFLEN", "CIPOS", "CIEND"])
}
SvCaller::Gcnv { .. } => {
self.all_format_defined(header, &["QA", "QS", "QSE", "QSS"])
&& self.all_info_defined(header, &["END", "AC_Orig", "AN_Orig"])
}
SvCaller::Popdel { .. } => {
self.all_format_defined(header, &["LAD", "DAD", "FL"])
&& self.all_info_defined(header, &["AF", "IMPRECISE", "SVLEN"])
}
SvCaller::Sniffles2 { .. } => {
self.all_format_defined(header, &["GT", "DR", "DV"])
&& self.all_info_defined(header, &["MOSAIC", "CONSENSUS_SUPPORT", "SUPP_VEC"])
&& self.source_starts_with(header, "Sniffles2")
}
}
}
fn extract_version(
&self,
header: &VcfHeader,
record: &VcfRecord,
) -> Result<Self, anyhow::Error> {
Ok(match self {
SvCaller::ClinCnv { .. } => SvCaller::ClinCnv {
version: self
.version_from_source_trailing(header, ' ')?
.replace('v', ""),
},
SvCaller::Delly { .. } => SvCaller::Delly {
version: self.version_from_info_svmethod(record)?,
},
SvCaller::DragenSv { .. } => SvCaller::DragenSv {
version: self.version_from_source_trailing(header, ' ')?,
},
SvCaller::Gcnv { .. } => SvCaller::Gcnv {
version: self.version_from_mapping_key(header, "GATKCommandLine")?,
},
SvCaller::DragenCnv { .. } => {
let raw_version = self.version_from_mapping_key(header, "DRAGENVersion")?;
let mut version = raw_version
.split(' ')
.nth(1)
.expect("Problem extracting version from DRAGENVersion")
.chars();
version.next_back();
let version = version.as_str().to_owned();
SvCaller::DragenCnv { version }
}
SvCaller::Manta { .. } => SvCaller::Manta {
version: self.version_from_source_trailing(header, ' ')?,
},
SvCaller::Melt { .. } => {
for (key, values) in header.other_records() {
if key.as_ref() == "source"
&& let noodles::vcf::header::record::value::Collection::Unstructured(inner) =
values
&& let Some(version) = inner[0].split('v').next_back()
{
return Ok(SvCaller::Melt {
version: version.to_string(),
});
}
}
anyhow::bail!("Could not extract version for MELT")
}
SvCaller::Popdel { .. } => SvCaller::Popdel {
version: self.version_from_info_svmethod(record)?,
},
SvCaller::Sniffles2 { .. } => SvCaller::Sniffles2 {
version: self.version_from_source_trailing(header, '_')?,
},
})
}
fn version_from_info_svmethod(&self, record: &VcfRecord) -> Result<String, anyhow::Error> {
let value = record
.info()
.get("SVMETHOD")
.ok_or(anyhow::anyhow!("Problem with INFO/SVMETHOD field"))?
.ok_or(anyhow::anyhow!("Problem with INFO/SVMETHOD INFO field"))?;
if let field::Value::String(value) = value {
Ok(value.split('v').next_back().unwrap().to_string())
} else {
anyhow::bail!("Problem with INFO/SVMETHOD INFO field")
}
}
fn version_from_source_trailing(
&self,
header: &VcfHeader,
splitter: char,
) -> Result<String, anyhow::Error> {
for (key, values) in header.other_records() {
if key.as_ref() == "source"
&& let noodles::vcf::header::record::value::Collection::Unstructured(inner) = values
&& let Some(version) = inner[0].split(splitter).next_back()
{
return Ok(version.to_string());
}
}
anyhow::bail!("Could not extract ##source header")
}
fn version_from_mapping_key(
&self,
header: &VcfHeader,
row_key: &str,
) -> Result<String, anyhow::Error> {
for (key, values) in header.other_records() {
if key.as_ref() == row_key
&& let noodles::vcf::header::record::value::Collection::Structured(inner) = values
{
for (_, inner2) in inner.iter() {
if let Some(version) = inner2.other_fields().get("Version") {
return Ok(version.to_string());
}
}
}
}
anyhow::bail!("Could not extract ##{} header", row_key)
}
fn all_format_defined(&self, header: &VcfHeader, names: &[&str]) -> bool {
let mut missing = names.iter().map(|s| s.to_string()).collect::<HashSet<_>>();
for (key, _) in header.formats() {
missing.remove(key);
}
missing.is_empty()
}
fn all_info_defined(&self, header: &VcfHeader, names: &[&str]) -> bool {
let mut missing = names.iter().map(|s| s.to_string()).collect::<HashSet<_>>();
for (key, _) in header.infos() {
missing.remove(key);
}
missing.is_empty()
}
fn source_starts_with(&self, header: &VcfHeader, prefix: &str) -> bool {
for (key, values) in header.other_records() {
if key.as_ref() == "source"
&& let noodles::vcf::header::record::value::Collection::Unstructured(inner) = values
&& inner[0].starts_with(prefix)
{
return true;
}
}
false
}
}
pub async fn guess_sv_caller(
reader: &mut impl NoodlesVariantReader,
) -> Result<SvCaller, anyhow::Error> {
let header = reader.read_header().await?;
let mut records = reader.records(&header).await;
let record = records
.try_next()
.await
.map_err(|e| anyhow::anyhow!("Problem reading VCF records: {}", e))?
.ok_or(anyhow::anyhow!("No records found"))?;
for caller in SvCaller::iter() {
if caller.caller_compatible(&header) {
return caller.extract_version(&header, &record);
}
}
anyhow::bail!("Could not guess SV caller from VCF header and first record.");
}
pub trait VcfRecordConverter {
fn assembly(&self) -> &str;
fn tsv_contig_style(&self) -> TsvContigStyle;
fn contig_manager(&self) -> &ContigManager;
fn convert(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
uuid: Uuid,
) -> Result<VarFishStrucvarTsvRecord, anyhow::Error> {
let mut tsv_record = VarFishStrucvarTsvRecord::default();
self.fill_sv_type(vcf_record, &mut tsv_record)?;
self.fill_coords(vcf_record, &mut tsv_record)?;
self.fill_callers_uuid(uuid, &mut tsv_record)?;
self.fill_genotypes(pedigree, vcf_record, &mut tsv_record)?;
self.fill_cis(vcf_record, &mut tsv_record)?;
self.fill_info(vcf_record, &mut tsv_record)?;
Ok(tsv_record)
}
fn fill_info(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
if let Some(alt) = vcf_record.alternate_bases().as_ref().first() {
let ref_allele = vcf_record.reference_bases().to_string();
let alt_allele = alt.clone();
if Breakend::from_ref_alt_str(&ref_allele, &alt_allele).is_ok() {
tsv_record.info.alt = Some(alt_allele);
}
}
Ok(())
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error>;
fn fill_sv_type(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let sv_type = vcf_record
.info()
.get(SV_TYPE)
.ok_or_else(|| anyhow::anyhow!("SVTYPE not found"))?
.ok_or_else(|| anyhow::anyhow!("SVTYPE empty"))?;
if let field::Value::String(value) = sv_type {
tsv_record.sv_sub_type = SvSubType::from_str(value)?;
} else {
anyhow::bail!("SVTYPE is not a string");
}
tsv_record.sv_type = tsv_record.sv_sub_type.into();
tsv_record.pe_orientation = tsv_record.sv_type.into();
Ok(())
}
fn fill_coords(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let assembly = self.assembly();
let contig_manager = self.contig_manager();
tsv_record.release = assembly.to_string();
let contig_name = vcf_record.reference_sequence_name();
if let Some(contig_info) = contig_manager.get_contig_info(contig_name) {
tsv_record.chromosome_no = contig_info.chrom_no;
tsv_record.chromosome = match self.tsv_contig_style() {
TsvContigStyle::Passthrough => contig_name.to_string(),
TsvContigStyle::WithChr => contig_info.name_with_chr,
TsvContigStyle::WithoutChr => contig_info.name_without_chr,
TsvContigStyle::Auto => {
if assembly == "GRCh38" {
if ContigManager::is_mitochondrial(contig_info.chrom_no) {
"chrM".into()
} else {
contig_info.name_with_chr
}
} else {
contig_info.name_without_chr
}
}
};
} else {
return Err(anyhow::anyhow!(
"Contig {} not found in assembly",
contig_name
));
}
let start: usize = vcf_record
.variant_start()
.expect("Telomeric breakend not supported")
.get();
tsv_record.start = start as i32;
let mut end: Option<i32> = None;
let alleles = vcf_record.alternate_bases().as_ref();
if alleles.len() != 1 {
panic!(
"Only one alternative allele is supported for SVs, got {} alternative alleles ({:?})",
alleles.len(),
alleles
);
}
let allele = &alleles[0];
if allele.contains('[') || allele.contains(']') {
let reference = vcf_record.reference_bases().to_string();
let bnd = Breakend::from_ref_alt_str(&reference, allele)?;
let contig2_name = bnd.chrom;
if let Some(contig_info) = contig_manager.get_contig_info(&contig2_name) {
tsv_record.chromosome_no2 = contig_info.chrom_no;
tsv_record.chromosome2 = match self.tsv_contig_style() {
TsvContigStyle::Passthrough => contig2_name.to_string(),
TsvContigStyle::WithChr => contig_info.name_with_chr,
TsvContigStyle::WithoutChr => contig_info.name_without_chr,
TsvContigStyle::Auto => {
if assembly.eq_ignore_ascii_case("grch38") {
if ContigManager::is_mitochondrial(contig_info.chrom_no) {
"chrM".into()
} else {
contig_info.name_with_chr
}
} else {
contig_info.name_without_chr
}
}
};
} else {
tsv_record.chromosome2 = contig2_name.to_string();
tsv_record.chromosome_no2 = 0;
}
end = Some(bnd.pos);
tsv_record.pe_orientation = bnd.pe_orientation;
} else {
tsv_record.chromosome2.clone_from(&tsv_record.chromosome);
tsv_record.chromosome_no2 = tsv_record.chromosome_no;
}
tsv_record.end = if let Some(end) = end {
end
} else {
let tmp_end = vcf_record
.info()
.get(END_POSITION)
.map(|end| {
end.map(|end| match end {
field::Value::Integer(value) => Ok(Some(*value)),
_ => anyhow::bail!("END is not an integer"),
})
})
.unwrap_or_default()
.transpose()?
.unwrap_or_default();
if let Some(end) = tmp_end {
end
} else {
tsv_record.start
}
};
if tsv_record.sv_type == SvType::Bnd {
tsv_record.bin =
binning::bin_from_range(tsv_record.start - 1, tsv_record.start)? as u32;
tsv_record.bin2 = binning::bin_from_range(tsv_record.end - 1, tsv_record.end)? as u32;
} else {
if tsv_record.sv_type == SvType::Ins {
tsv_record.bin =
binning::bin_from_range(tsv_record.start - 1, tsv_record.start)? as u32;
} else {
tsv_record.bin =
binning::bin_from_range(tsv_record.start - 1, tsv_record.end)? as u32;
}
tsv_record.bin2 = tsv_record.bin;
}
Ok(())
}
fn fill_callers_uuid(
&self,
uuid: Uuid,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
tsv_record.callers = vec![self.caller_version()];
tsv_record.sv_uuid = uuid;
Ok(())
}
fn caller_version(&self) -> String;
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error>;
}
mod conv {
use crate::annotate::genotype_string;
use crate::common::TsvContigStyle;
use crate::common::contig::ContigManager;
use crate::ped::PedigreeByName;
use crate::ped::Sex;
use noodles::vcf::header::FileFormat;
use noodles::vcf::variant::RecordBuf as VcfRecord;
use noodles::vcf::variant::record::info::field::key::{
END_CONFIDENCE_INTERVALS, POSITION_CONFIDENCE_INTERVALS,
};
use noodles::vcf::variant::record_buf::info::field::Value;
use noodles::vcf::variant::record_buf::info::field::value::Array;
use noodles::vcf::variant::record_buf::samples::sample;
use super::GenotypeInfo;
use super::VarFishStrucvarTsvRecord;
use super::VcfRecordConverter;
use super::vcf_header::{FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR};
pub fn extract_standard_cis(
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let cipos = vcf_record.info().get(POSITION_CONFIDENCE_INTERVALS);
if let Some(Some(Value::Array(Array::Integer(cipos)))) = cipos {
if cipos.len() == 2 {
tsv_record.start_ci_left =
cipos[0].ok_or(anyhow::anyhow!("CIPOS[0] is missing"))?;
tsv_record.start_ci_right =
cipos[1].ok_or(anyhow::anyhow!("CIPOS[1] is missing"))?;
} else {
anyhow::bail!("CIPOS has wrong number of elements");
}
}
let ciend = vcf_record.info().get(END_CONFIDENCE_INTERVALS);
if let Some(Some(Value::Array(Array::Integer(ciend)))) = ciend {
if ciend.len() == 2 {
tsv_record.end_ci_left = ciend[0].ok_or(anyhow::anyhow!("CIEND[0] is missing"))?;
tsv_record.end_ci_right = ciend[1].ok_or(anyhow::anyhow!("CIEND[1] is missing"))?;
} else {
anyhow::bail!("CIEND has wrong number of elements");
}
}
Ok(())
}
pub struct ClinCnvVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl ClinCnvVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for ClinCnvVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("ClinCNVv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("CN", Some(sample::Value::Float(cn))) => {
entries[sample_no].cn = Some(*cn);
}
("CN", Some(sample::Value::Integer(cn))) => {
entries[sample_no].cn = Some(*cn as f32);
}
("GQ", Some(sample::Value::Integer(gq))) => {
entries[sample_no].gq = Some(*gq);
}
("NP", Some(sample::Value::Integer(np))) => {
entries[sample_no].pc = Some(*np);
}
_ => (),
}
}
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
}
pub struct DellyVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl DellyVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for DellyVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("DELLYv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
let mut pec = 0;
let mut src = 0;
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("GQ", Some(sample::Value::Integer(gq))) => {
entries[sample_no].gq = Some(*gq);
}
("FT", Some(sample::Value::String(ft))) => {
entries[sample_no].ft =
Some(ft.split(';').map(|s| s.to_string()).collect());
}
("DV", Some(sample::Value::Integer(dv))) => {
entries[sample_no].pev = Some(*dv);
pec += *dv;
}
("DR", Some(sample::Value::Integer(dr))) => {
pec += *dr;
}
("RV", Some(sample::Value::Integer(rv))) => {
entries[sample_no].srv = Some(*rv);
src += *rv;
}
("RR", Some(sample::Value::Integer(rr))) => {
src += *rr;
}
_ => (),
}
}
entries[sample_no].pec = Some(pec);
entries[sample_no].src = Some(src);
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
}
pub struct DragenCnvVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl DragenCnvVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for DragenCnvVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("DRAGEN_CNVv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("PE", Some(sample::Value::Integer(pe))) => {
entries[sample_no].pev = Some(*pe);
}
("CN", Some(sample::Value::Float(cn))) => {
entries[sample_no].cn = Some(*cn);
}
("CN", Some(sample::Value::Integer(cn))) => {
entries[sample_no].cn = Some(*cn as f32);
}
("BC", Some(sample::Value::Integer(bc))) => {
entries[sample_no].pc = Some(*bc);
}
_ => (),
}
}
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
}
pub struct DragenSvVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl DragenSvVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for DragenSvVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("DRAGEN_SVv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
fill_genotypes_illumina_sv(pedigree, &self.samples, vcf_record, tsv_record)
}
}
pub struct GcnvVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl GcnvVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for GcnvVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("GATK_GCNVv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("CN", Some(sample::Value::Float(cn))) => {
entries[sample_no].cn = Some(*cn);
}
("CN", Some(sample::Value::Integer(cn))) => {
entries[sample_no].cn = Some(*cn as f32);
}
("NP", Some(sample::Value::Integer(np))) => {
entries[sample_no].pc = Some(*np);
}
_ => (),
}
}
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
}
pub struct MantaVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl MantaVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
fn fill_genotypes_illumina_sv(
pedigree: &PedigreeByName,
samples: &[String],
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&samples[sample_no]);
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("GQ", Some(sample::Value::Integer(gq))) => {
entries[sample_no].gq = Some(*gq);
}
("FT", Some(sample::Value::String(ft))) => {
entries[sample_no].ft =
Some(ft.split(';').map(|s| s.to_string()).collect());
}
("PR", Some(sample::Value::Array(sample::value::Array::Integer(dv)))) => {
let ref_ = dv[0].expect("PR[0] is missing");
let var = dv[1].expect("PR[1] is missing");
entries[sample_no].pec = Some(ref_ + var);
entries[sample_no].pev = Some(var);
}
("SR", Some(sample::Value::Array(sample::value::Array::Integer(sr)))) => {
let ref_ = sr[0].expect("SR[0] is missing");
let var = sr[1].expect("SR[1] is missing");
entries[sample_no].src = Some(ref_ + var);
entries[sample_no].srv = Some(var);
}
_ => (),
}
}
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
impl VcfRecordConverter for MantaVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("MANTAv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
fill_genotypes_illumina_sv(pedigree, &self.samples, vcf_record, tsv_record)
}
}
pub struct MeltVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl MeltVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for MeltVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("GL", Some(sample::Value::Array(sample::value::Array::Float(gl)))) => {
let mut gls = gl.iter().filter_map(|x| *x).collect::<Vec<_>>();
gls.sort_by(|a, b| b.partial_cmp(a).unwrap());
if gls.len() >= 2 {
entries[sample_no].gq = Some((gls[0] - gls[1]).round() as i32);
}
}
("DP", Some(sample::Value::Integer(dp))) => {
entries[sample_no].pec = Some(dp / 2);
entries[sample_no].src = Some(dp / 2 + dp % 2);
}
("AD", Some(sample::Value::Integer(ad))) => {
entries[sample_no].pev = Some(ad / 2);
entries[sample_no].srv = Some(ad / 2 + ad % 2);
}
_ => (),
}
}
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
fn caller_version(&self) -> String {
format!("MELTv{}", self.version)
}
fn fill_cis(
&self,
_vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
tsv_record.start_ci_left = 0;
tsv_record.start_ci_right = 0;
tsv_record.end_ci_left = 0;
tsv_record.end_ci_right = 0;
Ok(())
}
}
pub struct PopdelVcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl PopdelVcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for PopdelVcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("POPDELv{}", self.version)
}
fn fill_cis(
&self,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
extract_standard_cis(vcf_record, tsv_record)
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("GQ", Some(sample::Value::Integer(gq))) => {
entries[sample_no].gq = Some(*gq);
}
("PR", Some(sample::Value::Array(sample::value::Array::Integer(dad)))) => {
let ref_ = dad[0].expect("DAD[0] is missing");
let var = dad[3].expect("DAD[3] is missing");
entries[sample_no].pec = Some(ref_ + var);
entries[sample_no].pev = Some(var);
}
_ => (),
}
}
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
}
fn process_gt(
entries: &mut [GenotypeInfo],
sample_no: usize,
gt: &String,
pedigree: &PedigreeByName,
tsv_record: &mut VarFishStrucvarTsvRecord,
) {
entries[sample_no].gt = Some(gt.to_string());
let sex = pedigree
.individuals
.get(&entries[sample_no].name)
.unwrap_or_else(|| panic!("sample must be in pedigree: {:?}", &entries[sample_no].name))
.sex;
let is_chr_x = ContigManager::is_chr_x(tsv_record.chromosome_no);
let is_chr_y = ContigManager::is_chr_y(tsv_record.chromosome_no);
let has_ref = entries[sample_no]
.gt
.as_ref()
.expect("just set")
.contains('0');
let has_alt = entries[sample_no]
.gt
.as_ref()
.expect("just set")
.contains('1');
match (is_chr_x, is_chr_y, sex, has_ref, has_alt) {
(false, false, _, false, false) => (),
(false, false, _, false, true) => tsv_record.num_hom_alt += 1,
(false, false, _, true, false) => tsv_record.num_hom_ref += 1,
(false, false, _, true, true) => tsv_record.num_het += 1,
(true, false, _, false, false) => (),
(true, false, Sex::Male, false, true) => tsv_record.num_hemi_alt += 1,
(true, false, Sex::Male, true, false) => tsv_record.num_hemi_ref += 1,
(true, false, Sex::Male, true, true) => {
tsv_record.num_hemi_alt += 1;
}
(true, false, Sex::Female, false, true) => tsv_record.num_hom_alt += 1,
(true, false, Sex::Female, true, false) => tsv_record.num_hom_ref += 1,
(true, false, Sex::Female, true, true) => tsv_record.num_het += 1,
(true, false, _, _, _) => (),
(false, true, _, false, false) => (),
(false, true, Sex::Male, false, true) => tsv_record.num_hemi_alt += 1,
(false, true, Sex::Male, true, false) => tsv_record.num_hemi_ref += 1,
(false, true, Sex::Male, true, true) => {
tsv_record.num_hemi_alt += 1;
}
(false, true, Sex::Female, _, _) => {}
(false, true, _, _, _) => (),
(true, true, _, _, _) => panic!("cannot be both chrX and chrY"),
}
}
fn postproc_gt_cn(tsv_record: &mut VarFishStrucvarTsvRecord, pedigree: &PedigreeByName) {
let is_chr_x = ContigManager::is_chr_x(tsv_record.chromosome_no);
let is_chr_y = ContigManager::is_chr_y(tsv_record.chromosome_no);
for entry in &tsv_record.genotype.entries {
let has_ref = entry.gt.as_ref().map(|s| s.contains('0')).unwrap_or(false);
let has_alt = entry.gt.as_ref().map(|s| s.contains('1')).unwrap_or(false);
if !has_ref && !has_alt {
let sex = pedigree
.individuals
.get(&entry.name)
.unwrap_or_else(|| panic!("sample must be in pedigree: {:?}", &entry.name))
.sex;
let expected_cn = match (sex, is_chr_x, is_chr_y) {
(_, false, false) => Some(2.),
(Sex::Male, false, true) | (Sex::Male, true, false) => Some(1.),
(Sex::Female, true, false) => Some(2.),
_ => None,
};
if let (Some(cn), Some(expected_cn)) = (entry.cn, expected_cn) {
let delta = (cn - expected_cn).abs();
if delta.round() != delta {
tracing::warn!(
"Fractional CN value: {} and expected {} for sample {}, rounding.",
cn,
expected_cn,
entry.name,
);
}
let delta = delta.round() as i32;
if expected_cn == 1. {
tsv_record.num_hemi_alt += delta;
} else if delta == 0 {
tsv_record.num_hom_ref += 1;
} else if delta == 1 {
tsv_record.num_het += 1;
} else {
tsv_record.num_hom_alt += 1;
}
}
}
}
}
pub struct Sniffles2VcfRecordConverter {
pub samples: Vec<String>,
pub version: String,
pub assembly: String,
pub tsv_contig_style: TsvContigStyle,
pub contig_manager: ContigManager,
}
impl Sniffles2VcfRecordConverter {
pub fn new<T: AsRef<str>>(
version: &str,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Self {
Self {
samples: samples.iter().map(|s| s.as_ref().to_string()).collect(),
version: version.to_string(),
assembly: assembly.to_string(),
tsv_contig_style,
contig_manager: ContigManager::new(assembly),
}
}
}
impl VcfRecordConverter for Sniffles2VcfRecordConverter {
fn assembly(&self) -> &str {
&self.assembly
}
fn tsv_contig_style(&self) -> TsvContigStyle {
self.tsv_contig_style
}
fn contig_manager(&self) -> &ContigManager {
&self.contig_manager
}
fn caller_version(&self) -> String {
format!("SNIFFLESv{}", self.version)
}
fn fill_cis(
&self,
_vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
tsv_record.start_ci_left = 0;
tsv_record.start_ci_right = 0;
tsv_record.end_ci_left = 0;
tsv_record.end_ci_right = 0;
Ok(())
}
fn fill_genotypes(
&self,
pedigree: &PedigreeByName,
vcf_record: &VcfRecord,
tsv_record: &mut VarFishStrucvarTsvRecord,
) -> Result<(), anyhow::Error> {
let mut entries: Vec<GenotypeInfo> = vec![Default::default(); self.samples.len()];
for (sample_no, sample) in vcf_record.samples().values().enumerate() {
entries[sample_no].name.clone_from(&self.samples[sample_no]);
let mut src = 0;
for (key, value) in sample.keys().as_ref().iter().zip(sample.values().iter()) {
match (key.as_ref(), value) {
("GT", Some(sample::Value::String(gt))) => {
process_gt(&mut entries, sample_no, gt, pedigree, tsv_record);
}
("GT", Some(sample::Value::Genotype(gt))) => {
let gt = genotype_string(
gt,
FileFormat::new(FILE_FORMAT_MAJOR, FILE_FORMAT_MINOR),
);
process_gt(&mut entries, sample_no, >, pedigree, tsv_record);
}
("GQ", Some(sample::Value::Integer(gq))) => {
entries[sample_no].gq = Some(*gq);
}
("DV", Some(sample::Value::Integer(dv))) => {
entries[sample_no].srv = Some(*dv);
src += dv;
}
("DR", Some(sample::Value::Integer(dr))) => {
src += *dr;
}
_ => (),
}
}
entries[sample_no].src = Some(src);
}
tsv_record.genotype.entries = entries;
postproc_gt_cn(tsv_record, pedigree);
Ok(())
}
}
}
pub fn build_vcf_record_converter<T: AsRef<str>>(
caller: &SvCaller,
samples: &[T],
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Box<dyn VcfRecordConverter> {
match caller {
SvCaller::ClinCnv { version } => Box::new(conv::ClinCnvVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::Delly { version } => Box::new(conv::DellyVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::Manta { version } => Box::new(conv::MantaVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::Melt { version } => Box::new(conv::MeltVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::DragenSv { version } => Box::new(conv::DragenSvVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::DragenCnv { version } => Box::new(conv::DragenCnvVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::Gcnv { version } => Box::new(conv::GcnvVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::Popdel { version } => Box::new(conv::PopdelVcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
SvCaller::Sniffles2 { version } => Box::new(conv::Sniffles2VcfRecordConverter::new(
version,
samples,
assembly,
tsv_contig_style,
)),
}
}
pub async fn run_vcf_to_jsonl(
pedigree: &PedigreeByName,
reader: &mut impl NoodlesVariantReader,
header: &VcfHeader,
sv_caller: &SvCaller,
tmp_dir: &tempfile::TempDir,
cov_readers: &mut HashMap<String, maelstrom::Reader>,
rng: &mut StdRng,
assembly: &str,
tsv_contig_style: TsvContigStyle,
) -> Result<(), Error> {
tracing::debug!("opening temporary files in {}", tmp_dir.path().display());
let mut tmp_files = {
let mut files = Vec::new();
for i in 1..=25 {
let path = tmp_dir.path().join(format!("chrom-{}.jsonl", i));
tracing::debug!(" path = {}", path.display());
let file = OpenOptions::new()
.create(true)
.append(true)
.open(path)
.expect("could not open temporary file");
files.push(file);
}
files
};
let samples = header
.sample_names()
.iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
let converter = build_vcf_record_converter(sv_caller, &samples, assembly, tsv_contig_style);
let mut uuid_buf = [0u8; 16];
let mut records = reader.records(header).await;
while let Some(record) = records
.try_next()
.await
.map_err(|e| anyhow::anyhow!("problem reading VCF record: {}", e))?
{
rng.fill_bytes(&mut uuid_buf);
let uuid = Uuid::from_bytes(uuid_buf);
if record.alternate_bases().is_empty()
|| record.alternate_bases().as_ref() == ["<*>".to_string()]
{
tracing::warn!("skipping REF-only / empty ALT record {:?}", record);
continue;
}
let mut record = converter.convert(pedigree, &record, uuid)?;
annotate_cov_mq(&mut record, cov_readers)?;
if (1..=25).contains(&record.chromosome_no) {
let out_jsonl = &mut tmp_files[record.chromosome_no as usize - 1];
jsonl::write(out_jsonl, &record)?;
} else {
tracing::warn!(
"skipping record on chromosome {} (not in canonical set)",
record.chromosome
);
}
}
Ok(())
}
fn annotate_cov_mq(
record: &mut VarFishStrucvarTsvRecord,
cov_readers: &mut HashMap<String, maelstrom::Reader>,
) -> Result<(), anyhow::Error> {
for entry in record.genotype.entries.iter_mut() {
if let Some(reader) = cov_readers.get_mut(&entry.name) {
if matches!(record.sv_type, SvType::Del | SvType::Dup | SvType::Inv) {
let res = reader
.read(&record.chromosome, record.start..record.end)
.map_err(|e| {
anyhow::anyhow!(
"problem querying {}:{}-{} for {}: {}",
&record.chromosome,
record.start,
record.end,
entry.name,
e
)
})?;
entry.anc = Some(res.mean_coverage as f32);
entry.amq = Some(res.mean_mapq as i32);
}
} else {
tracing::warn!("no coverage information for sample {}", entry.name);
}
}
Ok(())
}
pub fn read_and_cluster_for_contig(
tmp_dir: &tempfile::TempDir,
contig_no: usize,
slack_ins: i32,
slack_bnd: i32,
min_overlap: f32,
) -> Result<Vec<VarFishStrucvarTsvRecord>, anyhow::Error> {
let jsonl_path = tmp_dir.path().join(format!("chrom-{}.jsonl", contig_no));
tracing::debug!("clustering files from {}", jsonl_path.display());
let mut reader = File::open(jsonl_path).map(BufReader::new)?;
let mut records: Vec<VarFishStrucvarTsvRecord> = Vec::new();
let mut clusters: Vec<Vec<usize>> = vec![];
let mut tree: IntervalTree<i32, usize> = IntervalTree::new();
loop {
match jsonl::read::<_, VarFishStrucvarTsvRecord>(&mut reader) {
Ok(record) => {
let slack = match record.sv_type {
SvType::Ins => slack_ins,
SvType::Bnd => slack_bnd,
_ => 0,
};
let query = match record.sv_type {
SvType::Ins | SvType::Bnd => {
let query_start = std::cmp::max(1, record.start - 1 - slack);
let query_end = record.start + slack;
query_start..query_end
}
_ => {
let query_start = std::cmp::max(1, record.start - 1 - slack);
let query_end = record.end + slack;
query_start..query_end
}
};
let mut found_any_cluster = false;
for mut it_tree in tree.find_mut(&query) {
let cluster_idx = *it_tree.data();
let mut match_all_in_cluster = true;
for it_cluster in &clusters[cluster_idx] {
let record_id = it_cluster;
let match_this_range = match record.sv_type {
SvType::Bnd | SvType::Ins => true,
_ => {
let ovl = record.overlap(&records[*record_id]);
assert!(ovl >= 0f32);
ovl >= min_overlap
}
};
let match_this_sv_type = record.sv_type == records[*record_id].sv_type;
match_all_in_cluster =
match_all_in_cluster && match_this_range && match_this_sv_type;
}
if match_all_in_cluster {
clusters[cluster_idx].push(records.len());
found_any_cluster = true;
break;
}
}
if !found_any_cluster {
match record.sv_type {
SvType::Ins | SvType::Bnd => {
tree.insert((record.start - 1)..record.start, clusters.len());
}
_ => {
tree.insert((record.start - 1)..record.end, clusters.len());
}
};
clusters.push(vec![records.len()]);
}
records.push(record);
}
Err(jsonl::ReadError::Eof) => {
break; }
_ => anyhow::bail!("Problem reading JSONL file"),
}
}
let mut result = Vec::new();
for cluster in &clusters {
let mut record = records[cluster[0]].clone();
for other_idx in cluster[1..].iter() {
record.merge(&records[*other_idx]);
}
result.push(record);
}
result.sort_by(|a, b| a.start.cmp(&b.start));
Ok(result)
}
fn normalize_assembly(assembly: &str) -> Result<String, anyhow::Error> {
let normalized = match assembly.to_lowercase().trim() {
"grch37" | "hg19" => "GRCh37",
"grch38" | "hg38" => "GRCh38",
_ => {
return Err(anyhow::anyhow!(
"Unsupported assembly '{}'. Supported assemblies: GRCh37, GRCh38, hg19, hg38",
assembly
));
}
};
Ok(normalized.to_string())
}
pub async fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("config = {:#?}", &args);
tracing::info!("Loading pedigree...");
let pedigree = PedigreeByName::from_path(&args.path_input_ped)?;
tracing::info!("... done loading pedigree");
let assembly = normalize_assembly(&args.assembly)?;
let contig_manager = ContigManager::new(&assembly);
let header = {
let mut reader = VariantReaderBuilder::default().build_from_path(
args.path_input_vcf
.first()
.expect("must have at least one input VCF"),
)?;
reader.read_header()?
};
tracing::info!("Using input assembly {:?}", &assembly);
let mut rng = if let Some(rng_seed) = args.rng_seed {
StdRng::seed_from_u64(rng_seed)
} else {
rand::make_rng()
};
tracing::info!("Opening coverage tracks...");
let mut cov_readers: HashMap<String, maelstrom::Reader> = HashMap::new();
for path_cov in &args.path_cov_vcf {
tracing::debug!(" path: {}", path_cov);
let reader = maelstrom::Reader::from_path(path_cov)?;
cov_readers.insert(reader.sample_name.clone(), reader);
}
tracing::info!("... done opening coverage tracks");
let tmp_dir = tempfile::Builder::new().prefix("mehari").tempdir()?;
tracing::info!("Converting input VCF files to temporary files...");
for path_input in &args.path_input_vcf {
tracing::debug!("processing VCF file {}", path_input);
let sv_caller = {
let mut reader = open_variant_reader(path_input).await?;
guess_sv_caller(&mut reader).await?
};
tracing::debug!("guessed caller/version to be {:?}", &sv_caller);
let mut reader = open_variant_reader(path_input).await?;
let header: VcfHeader = reader.read_header().await?;
run_vcf_to_jsonl(
&pedigree,
&mut reader,
&header,
&sv_caller,
&tmp_dir,
&mut cov_readers,
&mut rng,
&assembly,
args.tsv_contig_style,
)
.await?;
}
tracing::info!("... done converting input files.");
match args.output_format {
OutputFormat::Vcf => {
tracing::info!(
"Writing merged and annotated VCF output to {}",
&args.output
);
let file_date = args
.file_date
.as_ref()
.cloned()
.unwrap_or(Utc::now().date_naive().format("%Y%m%d").to_string());
let header_out = vcf_header::build(&contig_manager, &pedigree, &file_date, &header)?;
let mut writer = open_variant_writer(&args.output).await?;
writer.set_assembly(assembly);
writer.write_noodles_header(&header_out).await?;
tracing::info!("Clustering SVs to output...");
for contig_no in 1..=25 {
tracing::info!(" clustering contig: {}", CANONICAL[contig_no - 1]);
let clustered_records = read_and_cluster_for_contig(
&tmp_dir,
contig_no,
args.slack_ins,
args.slack_bnd,
args.min_overlap,
)?;
for tsv_record in clustered_records {
let vcf_record: VcfRecord = tsv_record.try_into()?;
let annotated_variant = crate::annotate::seqvars::AnnotatedVariant {
vcf: vcf_record,
annotation: crate::annotate::seqvars::VariantAnnotation {
consequences: vec![],
frequencies: None,
clinvar: None,
},
};
writer
.write_annotated_record(&header_out, annotated_variant)
.await?;
}
}
tracing::info!("... done clustering SVs to output.");
writer.shutdown().await?;
}
OutputFormat::Tsv => {
tracing::info!(
"Writing merged and annotated TSV output to {}",
&args.output
);
let mut tsv_writer = VarFishStrucvarTsvWriter::with_path(&args.output)?;
tsv_writer.write_header()?;
for contig_no in 1..=25 {
tracing::info!(" clustering contig: {}", CANONICAL[contig_no - 1]);
let clustered_records = read_and_cluster_for_contig(
&tmp_dir,
contig_no,
args.slack_ins,
args.slack_bnd,
args.min_overlap,
)?;
for tsv_record in clustered_records {
tsv_writer.write_record(&tsv_record)?;
}
}
tsv_writer.flush()?;
}
}
tracing::info!("... done writing final output.");
Ok(())
}
pub mod bnd {
use super::PeOrientation;
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default, serde::Deserialize, serde::Serialize)]
pub struct Breakend {
pub chrom: String,
pub pos: i32,
pub ref_base: String,
pub alt_base: String,
pub leading_base: bool,
pub left_open: bool,
pub pe_orientation: PeOrientation,
}
#[derive(Debug, Clone, Copy)]
enum State {
Initial,
LeadingAlt,
Chrom,
Pos,
TrailingAlt,
}
impl Breakend {
pub fn from_ref_alt_str(reference: &str, alternative: &str) -> Result<Self, anyhow::Error> {
let mut left_open = true; let mut leading_bracket = false; let mut alt_base = String::new();
let mut chrom = String::new();
let mut pos = String::new();
let mut state: State = State::Initial;
for c in alternative.chars() {
match state {
State::Initial => {
if c == ']' || c == '[' {
left_open = c == ']';
leading_bracket = true;
state = State::Chrom;
} else {
leading_bracket = false;
state = State::LeadingAlt;
alt_base.push(c);
}
}
State::LeadingAlt => {
if c == ']' || c == '[' {
state = State::Chrom;
} else {
alt_base.push(c);
}
}
State::Chrom => {
if c == ':' {
state = State::Pos;
} else {
chrom.push(c);
}
}
State::Pos => {
if c == ']' || c == '[' {
left_open = c == ']';
state = State::TrailingAlt;
} else {
pos.push(c);
}
}
State::TrailingAlt => {
if !leading_bracket {
panic!("can only have trailing bases if leading bracket");
} else {
alt_base.push(c);
}
}
}
}
let pos: i32 = pos.parse()?;
let pe_orientation = match (!leading_bracket, left_open) {
(true, true) => PeOrientation::FiveToFive,
(true, false) => PeOrientation::FiveToThree,
(false, true) => PeOrientation::ThreeToFive,
(false, false) => PeOrientation::ThreeToThree,
};
Ok(Self {
chrom,
pos,
ref_base: reference.to_string(),
alt_base,
leading_base: !leading_bracket,
left_open,
pe_orientation,
})
}
}
}
#[cfg(test)]
mod test {
use std::fs::File;
use clap_verbosity_flag::Verbosity;
use noodles::vcf::variant::io::Write;
use pretty_assertions::assert_eq;
use rstest::rstest;
use temp_testdir::TempDir;
use uuid::Uuid;
use super::{
Args, OutputFormat, VarFishStrucvarTsvWriter, VcfHeader, VcfRecord, VcfRecordConverter,
bnd::Breakend,
build_vcf_record_converter,
conv::{
ClinCnvVcfRecordConverter, DellyVcfRecordConverter, DragenCnvVcfRecordConverter,
DragenSvVcfRecordConverter, GcnvVcfRecordConverter, MantaVcfRecordConverter,
MeltVcfRecordConverter, PopdelVcfRecordConverter,
},
guess_sv_caller, run, vcf_header,
};
use crate::common::TsvContigStyle;
use crate::common::contig::ContigManager;
use crate::{
annotate::strucvars::{
GenotypeCalls, GenotypeInfo, InfoRecord, PeOrientation, SvSubType, SvType,
VarFishStrucvarTsvRecord,
},
common::noodles::open_variant_reader,
ped::{Disease, Individual, PedigreeByName, Sex},
};
#[test]
fn parse_bnd() -> Result<(), anyhow::Error> {
insta::assert_yaml_snapshot!(Breakend::from_ref_alt_str("G", "A]17:198982]")?);
insta::assert_yaml_snapshot!(Breakend::from_ref_alt_str("G", "A[17:198982[")?);
insta::assert_yaml_snapshot!(Breakend::from_ref_alt_str("G", "]17:198982]A")?);
insta::assert_yaml_snapshot!(Breakend::from_ref_alt_str("G", "[17:198982[A")?);
insta::assert_yaml_snapshot!(Breakend::from_ref_alt_str("GA", "[17:198982[TA")?);
insta::assert_yaml_snapshot!(Breakend::from_ref_alt_str("GA", "TA]17:198982]")?);
Ok(())
}
fn run_test_vcf_to_jsonl(
pedigree: &PedigreeByName,
path_input_vcf: &str,
path_expected_jsonl: &str,
converter: &dyn VcfRecordConverter,
) -> Result<(), anyhow::Error> {
let out_file_name = "out.jsonl";
let temp = TempDir::default();
let out_jsonl = File::create(temp.join(out_file_name))?;
let mut reader =
noodles::vcf::io::reader::Builder::default().build_from_path(path_input_vcf)?;
let header_in = reader.read_header()?;
let mut bytes = [
0xa1, 0xa2, 0xa3, 0xa4, 0xb1, 0xb2, 0xc1, 0xc2, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
0xd7, 0xd8,
];
let mut records = reader.record_bufs(&header_in);
loop {
match records.next() {
Some(record) => {
let uuid = Uuid::from_bytes(bytes);
let record = converter.convert(pedigree, &record?, uuid)?;
jsonl::write(&out_jsonl, &record)?;
}
_ => {
break; }
}
bytes[0] += 1;
}
drop(out_jsonl);
let expected = std::fs::read_to_string(path_expected_jsonl)?;
let actual = std::fs::read_to_string(temp.join(out_file_name))?;
assert_eq!(expected, actual);
Ok(())
}
fn vcf_samples(path: &str) -> Result<Vec<String>, anyhow::Error> {
let mut reader = noodles::vcf::io::reader::Builder::default().build_from_path(path)?;
let header: VcfHeader = reader.read_header()?;
Ok(header
.sample_names()
.iter()
.map(|x| x.to_string())
.collect::<Vec<_>>())
}
fn sample_ped(samples: &Vec<String>) -> PedigreeByName {
let mut result = PedigreeByName::default();
for sample in samples {
let indiv = Individual {
family: String::from("FAM"),
name: sample.clone(),
father: if samples.contains(&String::from("father"))
&& sample != "father"
&& sample != "mother"
{
Some(String::from("father"))
} else {
None
},
mother: if samples.contains(&String::from("mother"))
&& sample != "father"
&& sample != "mother"
{
Some(String::from("mother"))
} else {
None
},
sex: if sample == "mother" {
Sex::Female
} else {
Sex::Male
},
disease: if sample == "mother" || sample == "father" {
Disease::Unaffected
} else {
Disease::Affected
},
};
result.individuals.insert(sample.clone(), indiv);
}
result
}
#[test]
fn vcf_to_jsonl_clincnv_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/clincnv-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/clincnv-min.out.jsonl",
&ClinCnvVcfRecordConverter::new("1.17.0", &samples, "GRCh37", TsvContigStyle::Auto),
)
}
#[test]
fn vcf_to_jsonl_delly_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/delly2-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/delly2-min.out.jsonl",
&DellyVcfRecordConverter::new("1.1.3", &samples, "GRCh37", TsvContigStyle::Auto),
)
}
#[test]
fn vcf_to_jsonl_dragen_sv_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/dragen-sv-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/dragen-sv-min.out.jsonl",
&DragenSvVcfRecordConverter::new(
"07.021.624.3.10.4",
&samples,
"GRCh37",
TsvContigStyle::Auto,
),
)
}
#[test]
fn vcf_to_jsonl_dragen_cnv_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/dragen-cnv-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/dragen-cnv-min.out.jsonl",
&DragenCnvVcfRecordConverter::new(
"07.021.624.3.10.4",
&samples,
"GRCh37",
TsvContigStyle::Auto,
),
)
}
#[test]
fn vcf_to_jsonl_gcnv_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/gcnv-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/gcnv-min.out.jsonl",
&GcnvVcfRecordConverter::new("4.3.0.0", &samples, "GRCh37", TsvContigStyle::Auto),
)
}
#[test]
fn vcf_to_jsonl_manta_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/manta-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/manta-min.out.jsonl",
&MantaVcfRecordConverter::new("1.6.0", &samples, "GRCh37", TsvContigStyle::Auto),
)
}
#[test]
fn vcf_to_jsonl_melt_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/melt-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/melt-min.out.jsonl",
&MeltVcfRecordConverter::new("2.2.2", &samples, "GRCh37", TsvContigStyle::Auto),
)
}
#[test]
fn vcf_to_jsonl_popdel_min() -> Result<(), anyhow::Error> {
let path_input_vcf = "tests/data/annotate/strucvars/popdel-min.vcf";
let samples = vcf_samples(path_input_vcf)?;
let pedigree: PedigreeByName = sample_ped(&samples);
run_test_vcf_to_jsonl(
&pedigree,
path_input_vcf,
"tests/data/annotate/strucvars/popdel-min.out.jsonl",
&PopdelVcfRecordConverter::new("1.1.2", &samples, "GRCh37", TsvContigStyle::Auto),
)
}
#[tokio::test]
async fn vcf_to_jsonl_with_detection() -> Result<(), anyhow::Error> {
let keys = &[
"delly2",
"dragen-cnv",
"dragen-sv",
"gcnv",
"manta",
"popdel",
"sniffles2",
];
for key in keys {
let path_input_vcf = format!("tests/data/annotate/strucvars/{}-min.vcf", key);
let path_expected_txt = format!("tests/data/annotate/strucvars/{}-min.out.jsonl", key);
let samples = vcf_samples(&path_input_vcf)?;
let pedigree: PedigreeByName = sample_ped(&samples);
let mut reader = open_variant_reader(&path_input_vcf).await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
let converter =
build_vcf_record_converter(&sv_caller, &samples, "GRCh37", TsvContigStyle::Auto);
run_test_vcf_to_jsonl(
&pedigree,
&path_input_vcf,
&path_expected_txt,
converter.as_ref(),
)?;
}
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_clincnv() -> Result<(), anyhow::Error> {
let mut reader =
open_variant_reader("tests/data/annotate/strucvars/clincnv-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_delly() -> Result<(), anyhow::Error> {
let mut reader =
open_variant_reader("tests/data/annotate/strucvars/delly2-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_dragen_sv() -> Result<(), anyhow::Error> {
let mut reader =
open_variant_reader("tests/data/annotate/strucvars/dragen-sv-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_dragen_cnv() -> Result<(), anyhow::Error> {
let mut reader =
open_variant_reader("tests/data/annotate/strucvars/dragen-cnv-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_gcnv() -> Result<(), anyhow::Error> {
let mut reader = open_variant_reader("tests/data/annotate/strucvars/gcnv-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_manta() -> Result<(), anyhow::Error> {
let mut reader = open_variant_reader("tests/data/annotate/strucvars/manta-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_melt() -> Result<(), anyhow::Error> {
let mut reader = open_variant_reader("tests/data/annotate/strucvars/melt-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[tokio::test]
async fn guess_sv_caller_popdel() -> Result<(), anyhow::Error> {
let mut reader =
open_variant_reader("tests/data/annotate/strucvars/popdel-min.vcf").await?;
let sv_caller = guess_sv_caller(&mut reader).await?;
insta::assert_yaml_snapshot!(sv_caller);
Ok(())
}
#[test]
fn build_vcf_header_37_no_pedigree() -> Result<(), anyhow::Error> {
let contig_manager = ContigManager::new("GRCh37");
let header = vcf_header::build(
&contig_manager,
&Default::default(),
"20150314",
&noodles::vcf::Header::builder().build(),
)?;
let mut writer = noodles::vcf::io::Writer::new(Vec::new());
writer.write_header(&header)?;
let actual = std::str::from_utf8(&writer.get_ref()[..])?;
let expected =
std::fs::read_to_string("tests/data/annotate/strucvars/header-grch37-noped.vcf")?;
assert_eq!(actual, expected);
Ok(())
}
#[test]
fn build_vcf_header_37_trio() -> Result<(), anyhow::Error> {
let contig_manager = ContigManager::new("GRCh37");
let header = vcf_header::build(
&contig_manager,
&example_trio(),
"20150314",
&example_trio_header(),
)?;
let mut writer = noodles::vcf::io::Writer::new(Vec::new());
writer.write_header(&header)?;
let actual = std::str::from_utf8(&writer.get_ref()[..])?;
let expected =
std::fs::read_to_string("tests/data/annotate/strucvars/header-grch37-trio.vcf")?;
assert_eq!(actual, expected);
Ok(())
}
fn example_trio_header() -> noodles::vcf::Header {
let mut builder = noodles::vcf::Header::builder();
for indiv in example_trio().individuals.values() {
builder = builder.add_sample_name(indiv.name.clone());
}
builder.build()
}
fn example_trio() -> PedigreeByName {
let individuals = indexmap::IndexMap::from_iter(vec![
(
String::from("index"),
Individual {
family: String::from("FAM"),
name: String::from("index"),
father: Some(String::from("father")),
mother: Some(String::from("mother")),
sex: Sex::Female,
disease: Disease::Affected,
},
),
(
String::from("father"),
Individual {
family: String::from("FAM"),
name: String::from("father"),
father: None,
mother: None,
sex: Sex::Male,
disease: Disease::Unaffected,
},
),
(
String::from("mother"),
Individual {
family: String::from("FAM"),
name: String::from("mother"),
father: None,
mother: None,
sex: Sex::Female,
disease: Disease::Unaffected,
},
),
]);
PedigreeByName { individuals }
}
#[test]
fn build_vcf_header_38_no_pedigree() -> Result<(), anyhow::Error> {
let contig_manager = ContigManager::new("GRCh38");
let header = vcf_header::build(
&contig_manager,
&Default::default(),
"20150314",
&noodles::vcf::Header::builder().build(),
)?;
let mut writer = noodles::vcf::io::Writer::new(Vec::new());
writer.write_header(&header)?;
let actual = std::str::from_utf8(&writer.get_ref()[..])?;
let expected =
std::fs::read_to_string("tests/data/annotate/strucvars/header-grch38-noped.vcf")?;
assert_eq!(actual, expected);
Ok(())
}
fn example_records() -> Vec<VarFishStrucvarTsvRecord> {
let bytes = [
0xa1, 0xa2, 0xa3, 0xa4, 0xb1, 0xb2, 0xc1, 0xc2, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6,
0xd7, 0xd8,
];
let mut bytes2 = bytes;
bytes2[0] += 1;
vec![
VarFishStrucvarTsvRecord {
release: String::from("GRCh37"),
chromosome: String::from("1"),
chromosome_no: 1,
bin: 512,
chromosome2: String::from("1"),
chromosome_no2: 1,
bin2: 512,
pe_orientation: PeOrientation::FiveToFive,
start: 1042,
end: 2042,
start_ci_left: 0,
start_ci_right: 0,
end_ci_left: 0,
end_ci_right: 0,
case_id: 0,
set_id: 0,
sv_uuid: Uuid::from_bytes(bytes),
callers: vec![String::from("MANTAv1.1.2")],
sv_type: SvType::Inv,
sv_sub_type: SvSubType::Inv,
info: Default::default(),
num_hom_alt: 0,
num_hom_ref: 2,
num_het: 1,
num_hemi_alt: 0,
num_hemi_ref: 0,
genotype: GenotypeCalls {
entries: vec![
GenotypeInfo {
name: String::from("index"),
gt: Some(String::from("0/1")),
ft: Some(vec![String::from("PASS")]),
gq: Some(99),
pec: Some(143),
pev: Some(43),
src: Some(143),
srv: Some(43),
amq: Some(99),
cn: Some(1.),
anc: Some(0.5),
pc: Some(10),
},
GenotypeInfo {
name: String::from("father"),
gt: Some(String::from("0/0")),
ft: Some(vec![String::from("PASS")]),
gq: Some(98),
pec: Some(43),
pev: Some(0),
src: Some(44),
srv: Some(0),
amq: Some(98),
cn: Some(2.),
anc: Some(1.0),
pc: Some(10),
},
GenotypeInfo {
name: String::from("mother"),
gt: Some(String::from("0/0")),
ft: Some(vec![String::from("PASS")]),
gq: Some(97),
pec: Some(32),
pev: Some(0),
src: Some(33),
srv: Some(0),
amq: Some(97),
cn: Some(2.),
anc: Some(1.0),
pc: Some(10),
},
],
},
},
VarFishStrucvarTsvRecord {
release: String::from("GRCh37"),
chromosome: String::from("1"),
chromosome_no: 1,
bin: 512,
chromosome2: String::from("17"),
chromosome_no2: 1,
bin2: 1234,
pe_orientation: PeOrientation::FiveToFive,
start: 10_1042,
end: 198_982,
start_ci_left: 0,
start_ci_right: 0,
end_ci_left: 0,
end_ci_right: 0,
case_id: 0,
set_id: 0,
sv_uuid: Uuid::from_bytes(bytes2),
callers: vec![String::from("MANTAv1.1.2")],
sv_type: SvType::Bnd,
sv_sub_type: SvSubType::Bnd,
info: InfoRecord {
alt: Some(String::from("]17:198982]A")),
},
num_hom_alt: 0,
num_hom_ref: 0,
num_het: 3,
num_hemi_alt: 0,
num_hemi_ref: 0,
genotype: GenotypeCalls {
entries: vec![
GenotypeInfo {
name: String::from("index"),
gt: Some(String::from("0/1")),
ft: Some(vec![String::from("PASS")]),
gq: Some(99),
pec: Some(143),
pev: Some(43),
src: Some(143),
srv: Some(43),
amq: Some(99),
cn: Some(1.),
anc: Some(0.5),
pc: Some(10),
},
GenotypeInfo {
name: String::from("father"),
gt: Some(String::from("0/1")),
ft: Some(vec![String::from("PASS")]),
gq: Some(99),
pec: Some(143),
pev: Some(43),
src: Some(143),
srv: Some(43),
amq: Some(99),
cn: Some(1.),
anc: Some(0.5),
pc: Some(10),
},
GenotypeInfo {
name: String::from("mother"),
gt: Some(String::from("0/1")),
ft: Some(vec![String::from("PASS")]),
gq: Some(99),
pec: Some(143),
pev: Some(43),
src: Some(143),
srv: Some(43),
amq: Some(99),
cn: Some(1.),
anc: Some(0.5),
pc: Some(10),
},
],
},
},
]
}
#[test]
fn write_vcf_from_varfish_records() -> Result<(), anyhow::Error> {
let contig_manager = ContigManager::new("GRCh38");
let header = vcf_header::build(
&contig_manager,
&example_trio(),
"20150314",
&example_trio_header(),
)?;
let mut writer = noodles::vcf::io::Writer::new(Vec::new());
writer.write_header(&header)?;
for varfish_record in example_records() {
let vcf_record: VcfRecord = varfish_record.try_into()?;
writer.write_variant_record(&header, &vcf_record)?;
}
let actual = std::str::from_utf8(&writer.get_ref()[..])?;
let expected = std::fs::read_to_string("tests/data/annotate/strucvars/example-grch38.vcf")?;
assert_eq!(actual, expected);
Ok(())
}
#[tokio::test]
async fn write_tsv_from_varfish_records() -> Result<(), anyhow::Error> {
let temp = TempDir::default();
{
let mut writer = VarFishStrucvarTsvWriter::with_path(temp.join("out.tsv"))?;
writer.write_header()?;
for varfish_record in example_records() {
writer.write_record(&varfish_record)?;
}
}
let expected = std::fs::read_to_string("tests/data/annotate/strucvars/example-grch38.tsv")?;
let actual = std::fs::read_to_string(temp.join("out.tsv"))?;
assert_eq!(actual, expected);
Ok(())
}
#[rstest]
#[case(true)]
#[case(false)]
#[tokio::test]
async fn test_with_maelstrom_reader(#[case] is_tsv: bool) -> Result<(), anyhow::Error> {
let temp = TempDir::default();
let args_common = crate::common::Args {
verbose: Verbosity::new(0, 1),
};
let suffix = if is_tsv { ".tsv" } else { ".vcf" };
let out_path = temp.join(format!("out{}", suffix));
let args = Args {
assembly: "GRCh37".into(),
path_input_ped: String::from("tests/data/annotate/strucvars/maelstrom/delly2-min.ped"),
path_input_vcf: vec![String::from(
"tests/data/annotate/strucvars/maelstrom/delly2-min.vcf",
)],
output: out_path.display().to_string(),
output_format: if is_tsv {
OutputFormat::Tsv
} else {
OutputFormat::Vcf
},
max_var_count: None,
path_cov_vcf: vec![String::from(
"tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz",
)],
file_date: Some(String::from("20230421")),
min_overlap: 0.8,
slack_bnd: 50,
slack_ins: 50,
rng_seed: Some(42),
tsv_contig_style: TsvContigStyle::Auto,
};
run(&args_common, &args).await?;
let expected = std::fs::read_to_string(format!(
"tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom{}",
suffix
))?;
let actual = std::fs::read_to_string(out_path)?;
assert_eq!(actual, expected);
Ok(())
}
#[tokio::test]
async fn test_sample_order_consistency() -> Result<(), anyhow::Error> {
let temp = TempDir::default();
let args_common = crate::common::Args {
verbose: Verbosity::new(0, 1),
};
let out_path = temp.join("out.vcf");
let args = Args {
assembly: "GRCh38".into(),
path_input_ped: String::from("tests/data/annotate/strucvars/test.order.ped"),
path_input_vcf: vec![String::from("tests/data/annotate/strucvars/test.order.vcf")],
output: out_path.display().to_string(),
output_format: OutputFormat::Vcf,
max_var_count: None,
path_cov_vcf: vec![],
file_date: Some(String::from("20250121")),
min_overlap: 0.8,
slack_bnd: 50,
slack_ins: 50,
rng_seed: Some(42),
tsv_contig_style: TsvContigStyle::Passthrough,
};
run(&args_common, &args).await?;
let expected =
std::fs::read_to_string("tests/data/annotate/strucvars/test.order.expected.vcf")?;
let actual = std::fs::read_to_string(&out_path)?;
assert_eq!(actual, expected);
Ok(())
}
}