use anyhow::ensure;
use derive_builder::Builder;
use log::debug;
use rust_lib_reference_genome::reference_genome::ReferenceGenome;
use std::collections::BTreeSet;
use crate::data_types::merge_benchmark::{MergeBenchmark, MergeClassification};
use crate::data_types::multi_region::MultiRegion;
use crate::data_types::phase_enums::PhasedZygosity;
use crate::data_types::variants::Variant;
use crate::query_optimizer::optimize_sequences;
#[derive(Builder, Clone, Copy)]
#[builder(default)]
pub struct MergeConfig {
no_conflict_enabled: bool,
majority_voting_enabled: bool,
conflict_selection: Option<usize>,
max_branch_factor: usize,
}
impl Default for MergeConfig {
fn default() -> Self {
Self {
no_conflict_enabled: false,
majority_voting_enabled: false,
conflict_selection: None,
max_branch_factor: 50,
}
}
}
impl MergeConfig {
pub fn no_conflict_enabled(&self) -> bool {
self.no_conflict_enabled
}
pub fn majority_voting_enabled(&self) -> bool {
self.majority_voting_enabled
}
pub fn conflict_selection(&self) -> Option<usize> {
self.conflict_selection
}
pub fn max_branch_factor(&self) -> usize {
self.max_branch_factor
}
}
pub fn solve_merge_region(problem: &MultiRegion, reference_genome: &ReferenceGenome, merge_strategy: MergeConfig) -> anyhow::Result<MergeBenchmark> {
let problem_id = problem.region_id();
let coordinates = problem.coordinates();
debug!("B#{problem_id} Scanning region #{problem_id}: {coordinates}");
let reference = reference_genome.get_full_chromosome(coordinates.chrom());
let mut delta_scores = vec![];
for (variants, zygs) in problem.variants().iter().zip(problem.zygosity().iter()) {
delta_scores.push(variant_delta_length(variants, zygs)?);
}
let mut all_identical = true;
let mut no_conflict = true;
let mut match_sets: Vec<BTreeSet<usize>> = vec![Default::default(); problem.variants().len()];
for (i, (v1, z1)) in problem.variants().iter().zip(problem.zygosity().iter()).enumerate() {
match_sets[i].insert(i);
for j in (i+1)..problem.variants().len() {
let v2 = &problem.variants()[j];
let z2 = &problem.zygosity()[j];
let exact_match = if delta_scores[i] == delta_scores[j] {
let all_opt_haps = optimize_sequences(
reference, coordinates, v1, z1, v2, z2, merge_strategy.max_branch_factor()
)?;
let basepair_analysis = &all_opt_haps[0];
basepair_analysis.is_exact_match()
} else {
false
};
debug!("B#{problem_id} Comparing inputs {i} and {j}, exact_match = {exact_match}");
all_identical &= exact_match;
no_conflict &= v1.is_empty() || v2.is_empty() ||
exact_match;
if exact_match {
match_sets[i].insert(j);
match_sets[j].insert(i);
}
}
}
let maj_count = match_sets.len() / 2 + 1;
let first_maj = match_sets.into_iter()
.find(|bset| bset.len() >= maj_count)
.map(|bset| bset.into_iter().collect::<Vec<usize>>())
.unwrap_or_default();
let classification = if all_identical {
MergeClassification::BasepairIdentical
} else if merge_strategy.no_conflict_enabled() && no_conflict {
let indices = problem.variants().iter().enumerate()
.filter_map(|(i, v)| {
if v.is_empty() {
None
} else {
Some(i)
}
})
.collect();
MergeClassification::NoConflict {
indices
}
} else if merge_strategy.majority_voting_enabled() && !first_maj.is_empty() {
MergeClassification::MajorityAgree { indices: first_maj }
} else if let Some(v_index) = merge_strategy.conflict_selection() {
MergeClassification::ConflictSelection { index: v_index }
} else {
MergeClassification::Different
};
Ok(MergeBenchmark::new(problem_id, classification))
}
pub fn variant_delta_length(variants: &[Variant], zygosities: &[PhasedZygosity]) -> anyhow::Result<i64> {
let mut total_delta = 0;
ensure!(variants.len() == zygosities.len(), "Variants and zygosities must be the same length");
for (v, z) in variants.iter().zip(zygosities.iter()) {
ensure!(*z != PhasedZygosity::Unknown, "Zygosity cannot be unknown");
let v_delta = v.allele1().len() as i64 - v.allele0().len() as i64;
let z_weight = z.to_allele_count() as i64;
total_delta += v_delta * z_weight;
}
Ok(total_delta)
}
#[cfg(test)]
mod tests {
use crate::data_types::coordinates::Coordinates;
use crate::data_types::phase_enums::PhasedZygosity;
use crate::data_types::variants::Variant;
use super::*;
fn generate_simple_reference() -> ReferenceGenome {
let mut ref_genome = ReferenceGenome::empty_reference();
ref_genome.add_contig(
"mock_chr1".to_string(), "ACCGTTACCAGGACTTGACAAACCG"
).unwrap();
ref_genome
}
#[test]
fn test_exact_mode() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 25);
let variants = vec![
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
];
let zygosity = vec![
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::HomozygousAlternate],
];
let problem = MultiRegion::new(0, coordinates.clone(), variants.clone(), zygosity).unwrap();
let result = solve_merge_region(&problem, &reference_genome, MergeConfig::default()).unwrap();
assert_eq!(result.merge_classification(), &MergeClassification::BasepairIdentical);
let zygosity = vec![
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::PhasedHet01]
];
let problem = MultiRegion::new(0, coordinates, variants, zygosity).unwrap();
let result = solve_merge_region(&problem, &reference_genome, MergeConfig::default()).unwrap();
assert_eq!(result.merge_classification(), &MergeClassification::Different);
}
#[test]
fn test_noconflict_mode() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 25);
let variants = vec![
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![]
];
let zygosity = vec![
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::HomozygousAlternate],
vec![],
];
let problem = MultiRegion::new(0, coordinates.clone(), variants.clone(), zygosity).unwrap();
let merge_config = MergeConfigBuilder::default().no_conflict_enabled(true).build().unwrap();
let result = solve_merge_region(&problem, &reference_genome, merge_config).unwrap();
assert_eq!(result.merge_classification(), &MergeClassification::NoConflict { indices: vec![0, 1] });
let variants = vec![
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()]
];
let zygosity = vec![
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::PhasedHet01]
];
let problem = MultiRegion::new(0, coordinates, variants, zygosity).unwrap();
let result = solve_merge_region(&problem, &reference_genome, merge_config).unwrap();
assert_eq!(result.merge_classification(), &MergeClassification::Different);
}
#[test]
fn test_majority_mode() {
let reference_genome = generate_simple_reference();
let coordinates = Coordinates::new("mock_chr1".to_string(), 0, 25);
let variants = vec![
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![]
];
let zygosity = vec![
vec![PhasedZygosity::HomozygousAlternate],
vec![PhasedZygosity::HomozygousAlternate],
vec![],
];
let problem = MultiRegion::new(0, coordinates.clone(), variants.clone(), zygosity).unwrap();
let merge_config = MergeConfigBuilder::default().majority_voting_enabled(true).build().unwrap();
let result = solve_merge_region(&problem, &reference_genome, merge_config).unwrap();
assert_eq!(result.merge_classification(), &MergeClassification::MajorityAgree { indices: vec![0, 1] });
let variants = vec![
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()],
vec![],
vec![Variant::new_snv(0, 5, b"T".to_vec(), b"C".to_vec()).unwrap()]
];
let zygosity = vec![
vec![PhasedZygosity::HomozygousAlternate],
vec![],
vec![PhasedZygosity::PhasedHet01]
];
let problem = MultiRegion::new(0, coordinates, variants, zygosity).unwrap();
let result = solve_merge_region(&problem, &reference_genome, merge_config).unwrap();
assert_eq!(result.merge_classification(), &MergeClassification::Different);
}
#[test]
fn test_variant_length_delta() {
let variants = [
Variant::new_snv(0, 10, b"A".to_vec(), b"C".to_vec()).unwrap(),
Variant::new_deletion(0, 12, b"ACGTACGT".to_vec(), b"A".to_vec()).unwrap(),
Variant::new_insertion(0, 25, b"A".to_vec(), b"ACC".to_vec()).unwrap()
];
let zygs = [
PhasedZygosity::HomozygousAlternate,
PhasedZygosity::PhasedHet01,
PhasedZygosity::HomozygousAlternate,
];
assert_eq!(variant_delta_length(&variants[0..1], &zygs[0..1]).unwrap(), 0);
assert_eq!(variant_delta_length(&variants[1..2], &zygs[1..2]).unwrap(), -7);
assert_eq!(variant_delta_length(&variants[2..3], &zygs[2..3]).unwrap(), 4);
assert_eq!(variant_delta_length(&variants, &zygs).unwrap(), -3);
}
}