use serde_json::{Map, Value, json};
use crate::consequence::{Consequence, ConsequenceResult};
use crate::types::Strand;
/// Serializes the consequence results of one variant as a VEP-style JSON array.
///
/// The returned value is always a JSON array holding exactly one annotation
/// object, even when `results` is empty. All arguments are forwarded unchanged
/// to `build_annotation`, which produces that object.
pub(crate) fn to_vep_json_array(
    chrom: &str,
    pos: u64,
    ref_allele: &[u8],
    alt_allele: &[u8],
    assembly: &str,
    results: &[ConsequenceResult],
) -> Value {
    let annotation = build_annotation(chrom, pos, ref_allele, alt_allele, assembly, results);
    Value::Array(vec![annotation])
}
fn build_annotation(
chrom: &str,
pos: u64,
ref_allele: &[u8],
alt_allele: &[u8],
assembly: &str,
results: &[ConsequenceResult],
) -> Value {
let seq_region = chrom.strip_prefix("chr").unwrap_or(chrom);
let start = pos + 1;
let end = pos + (ref_allele.len().max(1) as u64);
let allele_string = format!(
"{}/{}",
render_allele(ref_allele),
render_allele(alt_allele),
);
let most_severe = results
.iter()
.flat_map(|r| r.consequences.iter())
.min_by_key(|c| c.severity_rank())
.map(Consequence::as_str)
.unwrap_or("intergenic_variant");
let transcript_consequences: Vec<Value> =
results.iter().map(build_transcript_consequence).collect();
json!({
"seq_region_name": seq_region,
"start": start,
"end": end,
"strand": 1,
"allele_string": allele_string,
"assembly_name": assembly,
"most_severe_consequence": most_severe,
"transcript_consequences": transcript_consequences,
"colocated_variants": [],
})
}
/// Converts one `ConsequenceResult` into a `transcript_consequences` entry.
///
/// Keys that are always present: `transcript_id`, `gene_symbol`, `biotype`,
/// `strand` (`1` for `Strand::Plus`, `-1` for `Strand::Minus`), `impact` and
/// `consequence_terms` (same order as `r.consequences`).
///
/// Keys that are only emitted when the corresponding data exists: `hgvsc`,
/// `hgvsp`, `protein_start`, `protein_end`, `cds_start`, `cds_end`,
/// `cdna_start`, `cdna_end`, `amino_acids`, `codons`, `exon`, `intron`,
/// `mane_select` and `mane_plus_clinical`.
fn build_transcript_consequence(r: &ConsequenceResult) -> Value {
    let strand_sign: i8 = match r.strand {
        Strand::Plus => 1,
        Strand::Minus => -1,
    };
    let terms: Vec<Value> = r
        .consequences
        .iter()
        .map(|term| Value::from(term.as_str()))
        .collect();

    let mut entry = Map::new();
    entry.insert("transcript_id".to_owned(), Value::from(r.transcript.clone()));
    entry.insert("gene_symbol".to_owned(), Value::from(r.gene_symbol.clone()));
    entry.insert(
        "biotype".to_owned(),
        Value::from(r.biotype.as_str().to_string()),
    );
    entry.insert("strand".to_owned(), Value::from(strand_sign));
    entry.insert("impact".to_owned(), Value::from(r.impact.to_string()));
    entry.insert("consequence_terms".to_owned(), Value::Array(terms));

    if let Some(coding_change) = &r.hgvs_c {
        entry.insert("hgvsc".to_owned(), Value::from(coding_change.clone()));
    }

    // `hgvsp` is written as "<protein accession>:<protein change>", so it is
    // skipped unless both halves are available.
    if let Some((accession, protein_change)) =
        r.protein_accession.as_ref().zip(r.hgvs_p.as_ref())
    {
        entry.insert(
            "hgvsp".to_owned(),
            Value::from(format!("{accession}:{protein_change}")),
        );
    }

    // Optional coordinates, emitted in this fixed key order.
    for (key, position) in [
        ("protein_start", r.protein_start.map(u64::from)),
        ("protein_end", r.protein_end.map(u64::from)),
        ("cds_start", r.cds_position.map(u64::from)),
        ("cds_end", r.cds_position_end.map(u64::from)),
        ("cdna_start", r.cdna_position.map(u64::from)),
        ("cdna_end", r.cdna_position_end.map(u64::from)),
    ] {
        insert_opt_u64(&mut entry, key, position);
    }

    // Optional free-text fields, emitted in this fixed key order.
    for (key, text) in [
        ("amino_acids", r.amino_acids.as_deref()),
        ("codons", r.codons.as_deref()),
        ("exon", r.exon.as_deref()),
        ("intron", r.intron.as_deref()),
    ] {
        insert_opt_str(&mut entry, key, text);
    }

    // The MANE keys carry the transcript identifier as their value.
    if r.is_mane_select {
        entry.insert("mane_select".to_owned(), Value::from(r.transcript.clone()));
    }
    if r.is_mane_plus_clinical {
        entry.insert(
            "mane_plus_clinical".to_owned(),
            Value::from(r.transcript.clone()),
        );
    }

    Value::Object(entry)
}
/// Stores `value` under `key` as a JSON number; leaves `map` untouched for `None`.
fn insert_opt_u64(map: &mut Map<String, Value>, key: &str, value: Option<u64>) {
    let Some(number) = value else {
        return;
    };
    map.insert(key.to_owned(), Value::from(number));
}
/// Stores `value` under `key` as a JSON string; leaves `map` untouched for `None`.
fn insert_opt_str(map: &mut Map<String, Value>, key: &str, value: Option<&str>) {
    let Some(text) = value else {
        return;
    };
    map.insert(key.to_owned(), Value::String(text.to_owned()));
}
/// Renders an allele for use in `allele_string`.
///
/// An empty allele becomes `"-"`; anything else is decoded as UTF-8, with
/// invalid byte sequences replaced by U+FFFD.
fn render_allele(bytes: &[u8]) -> String {
    match bytes {
        [] => String::from("-"),
        sequence => String::from_utf8_lossy(sequence).into_owned(),
    }
}
// Unit tests for the VEP-style JSON output. Every test goes through
// `to_vep_json_array` and inspects the resulting `serde_json::Value`.
#[cfg(test)]
mod tests {
use super::*;
use crate::consequence::Impact;
use crate::types::Biotype;
// Baseline `ConsequenceResult` for the given transcript and gene symbol:
// no consequences, `Impact::Modifier`, plus strand, protein-coding biotype,
// every optional field `None` and every flag `false`. Tests override only the
// fields they care about via struct-update syntax (`..make_result(...)`).
fn make_result(transcript: &str, gene: &str) -> ConsequenceResult {
ConsequenceResult {
transcript: transcript.to_string(),
gene_symbol: gene.to_string(),
protein_accession: None,
consequences: Vec::new(),
impact: Impact::Modifier,
protein_start: None,
protein_end: None,
codons: None,
amino_acids: None,
exon: None,
intron: None,
cds_position: None,
cds_position_end: None,
cdna_position: None,
cdna_position_end: None,
strand: Strand::Plus,
biotype: Biotype::ProteinCoding,
is_mane_select: false,
is_mane_plus_clinical: false,
is_refseq_select: false,
hgvs_c: None,
hgvs_p: None,
predicts_nmd: false,
}
}
// Fully populated minus-strand missense result: checks every top-level field
// and every transcript-level field that is emitted for it.
#[test]
fn vep_json_snv_missense_full_shape() {
let r = ConsequenceResult {
consequences: vec![Consequence::MissenseVariant],
impact: Impact::Moderate,
protein_start: Some(248),
protein_end: Some(248),
codons: Some("Cgg/Tgg".to_string()),
amino_acids: Some("R/W".to_string()),
exon: Some("7/11".to_string()),
cds_position: Some(742),
cds_position_end: Some(742),
cdna_position: Some(884),
cdna_position_end: Some(884),
strand: Strand::Minus,
is_mane_select: true,
hgvs_c: Some("NM_000546.6:c.742C>T".to_string()),
hgvs_p: Some("p.Arg248Trp".to_string()),
protein_accession: Some("NP_000537.3".to_string()),
..make_result("NM_000546.6", "TP53")
};
let json = to_vep_json_array("chr17", 7_674_219, b"C", b"T", "GRCh38", &[r]);
let arr = json.as_array().expect("top-level should be an array");
assert_eq!(arr.len(), 1);
let top = &arr[0];
// The "chr" prefix is stripped and `start`/`end` are `pos + 1` for an SNV.
assert_eq!(top["seq_region_name"], "17");
assert_eq!(top["start"], 7_674_220);
assert_eq!(top["end"], 7_674_220);
// The top-level strand is always 1; the transcript strand is checked below.
assert_eq!(top["strand"], 1);
assert_eq!(top["allele_string"], "C/T");
assert_eq!(top["assembly_name"], "GRCh38");
assert_eq!(top["most_severe_consequence"], "missense_variant");
assert_eq!(top["colocated_variants"].as_array().unwrap().len(), 0);
let tc = &top["transcript_consequences"][0];
assert_eq!(tc["transcript_id"], "NM_000546.6");
assert_eq!(tc["gene_symbol"], "TP53");
assert_eq!(tc["biotype"], "protein_coding");
assert_eq!(tc["strand"], -1);
assert_eq!(tc["impact"], "MODERATE");
assert_eq!(
tc["consequence_terms"],
serde_json::json!(["missense_variant"])
);
assert_eq!(tc["hgvsc"], "NM_000546.6:c.742C>T");
// `hgvsp` is the protein accession joined to the protein change with ':'.
assert_eq!(tc["hgvsp"], "NP_000537.3:p.Arg248Trp");
assert_eq!(tc["protein_start"], 248);
assert_eq!(tc["protein_end"], 248);
assert_eq!(tc["cds_start"], 742);
assert_eq!(tc["cds_end"], 742);
assert_eq!(tc["cdna_start"], 884);
assert_eq!(tc["cdna_end"], 884);
assert_eq!(tc["amino_acids"], "R/W");
assert_eq!(tc["codons"], "Cgg/Tgg");
assert_eq!(tc["exon"], "7/11");
// `mane_select` carries the transcript identifier as its value.
assert_eq!(tc["mane_select"], "NM_000546.6");
}
// `hgvs_p` without a protein accession must not produce an `hgvsp` key.
#[test]
fn vep_json_hgvsp_omitted_without_protein_accession() {
let r = ConsequenceResult {
consequences: vec![Consequence::NonCodingTranscriptExonVariant],
impact: Impact::Modifier,
biotype: Biotype::LncRna,
hgvs_p: Some("p.?".to_string()),
protein_accession: None,
..make_result("NR_046018.2", "DDX11L1")
};
let json = to_vep_json_array("chr1", 11_868, b"G", b"A", "GRCh38", &[r]);
let tc = &json[0]["transcript_consequences"][0];
assert!(
tc.as_object().unwrap().get("hgvsp").is_none(),
"hgvsp should be omitted when protein_accession is None; got: {tc:?}"
);
}
// Two-base deletion: `end` covers the whole reference allele and the empty
// alternate allele is rendered as "-".
#[test]
fn vep_json_indel_frameshift_allele_string() {
let r = ConsequenceResult {
consequences: vec![Consequence::FrameshiftVariant],
impact: Impact::High,
hgvs_c: Some("NM_006772.2:c.1861_1862del".to_string()),
..make_result("NM_006772.2", "SYNGAP1")
};
let json = to_vep_json_array("chr6", 33_409_450, b"TG", b"", "GRCh38", &[r]);
let top = &json[0];
assert_eq!(top["start"], 33_409_451);
assert_eq!(top["end"], 33_409_452);
assert_eq!(top["allele_string"], "TG/-");
assert_eq!(top["most_severe_consequence"], "frameshift_variant");
let tc = &top["transcript_consequences"][0];
assert_eq!(tc["impact"], "HIGH");
assert_eq!(
tc["consequence_terms"],
serde_json::json!(["frameshift_variant"])
);
assert_eq!(tc["hgvsc"], "NM_006772.2:c.1861_1862del");
}
// No results at all: still one annotation object, with the
// "intergenic_variant" fallback and an empty `transcript_consequences` array.
#[test]
fn vep_json_intergenic_empty_results() {
let json = to_vep_json_array("chr1", 1_000_000, b"A", b"G", "GRCh38", &[]);
let arr = json.as_array().expect("top-level should be an array");
assert_eq!(arr.len(), 1);
let top = &arr[0];
assert_eq!(top["most_severe_consequence"], "intergenic_variant");
assert_eq!(top["transcript_consequences"].as_array().unwrap().len(), 0);
}
// The most severe term is chosen by `severity_rank`, not by position in the
// list, while `consequence_terms` keeps the input order.
#[test]
fn vep_json_most_severe_from_min_severity_rank() {
let r = ConsequenceResult {
consequences: vec![
Consequence::SpliceRegionVariant,
Consequence::MissenseVariant,
],
impact: Impact::Moderate,
..make_result("NM_000546.6", "TP53")
};
let json = to_vep_json_array("chr17", 7_674_219, b"C", b"T", "GRCh38", &[r]);
let top = &json[0];
assert_eq!(top["most_severe_consequence"], "missense_variant");
let tc = &top["transcript_consequences"][0];
assert_eq!(
tc["consequence_terms"],
serde_json::json!(["splice_region_variant", "missense_variant"])
);
}
// Optional keys whose source field is `None` must be absent from the object
// rather than serialized as null.
#[test]
fn vep_json_optional_fields_omitted_for_intron_variant() {
let r = ConsequenceResult {
consequences: vec![Consequence::IntronVariant],
intron: Some("7/10".to_string()),
..make_result("NM_000546.6", "TP53")
};
let json = to_vep_json_array("chr17", 7_675_000, b"A", b"G", "GRCh38", &[r]);
let tc = &json[0]["transcript_consequences"][0];
let obj = tc.as_object().unwrap();
assert_eq!(tc["intron"], "7/10");
for missing_key in [
"exon",
"amino_acids",
"codons",
"protein_start",
"protein_end",
"hgvsp",
"hgvsc",
"cds_start",
"cds_end",
"cdna_start",
"cdna_end",
] {
assert!(
obj.get(missing_key).is_none(),
"{missing_key} should be omitted for an intron variant; got: {tc:?}"
);
}
}
// Several transcripts: `most_severe_consequence` is taken across all of them
// and `transcript_consequences` preserves the input order.
#[test]
fn vep_json_multiple_transcripts_ordering() {
let synonymous = ConsequenceResult {
consequences: vec![Consequence::SynonymousVariant],
impact: Impact::Low,
..make_result("NM_000546.6", "TP53")
};
let missense = ConsequenceResult {
consequences: vec![Consequence::MissenseVariant],
impact: Impact::Moderate,
..make_result("NM_001126112.3", "TP53")
};
let json = to_vep_json_array(
"chr17",
7_674_219,
b"C",
b"T",
"GRCh38",
&[synonymous, missense],
);
let top = &json[0];
assert_eq!(top["most_severe_consequence"], "missense_variant");
let tcs = top["transcript_consequences"].as_array().unwrap();
assert_eq!(tcs.len(), 2);
assert_eq!(tcs[0]["transcript_id"], "NM_000546.6");
assert_eq!(tcs[0]["impact"], "LOW");
assert_eq!(tcs[1]["transcript_id"], "NM_001126112.3");
assert_eq!(tcs[1]["impact"], "MODERATE");
}
}