use std::{
collections::{HashMap, HashSet},
fmt::Debug,
sync::Arc,
};
use bstr::ByteSlice;
use rust_htslib::bcf::{self, record::GenotypeAllele};
use tracing::trace;
use super::cluster::ClusteringSettings;
use crate::counter;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct GtAndPhase {
pub alleles: [Option<u32>; 2],
pub phase: Option<i32>,
}
const UNKNOWN: GtAndPhase = GtAndPhase {
alleles: [None, None],
phase: None,
};
impl GtAndPhase {
pub fn has_missing(&self) -> bool {
self.alleles.iter().any(Option::is_none)
}
pub fn is_hom_ref(&self) -> bool {
self.alleles.iter().all(|a| *a == Some(0))
}
pub fn is_hom(&self) -> bool {
self.alleles[0] == self.alleles[1]
}
pub fn is_phased(&self) -> bool {
self.phase.is_some()
}
pub fn is_unphased_het(&self) -> bool {
!self.is_phased() && !self.is_hom()
}
}
impl From<&bcf::Record> for GtAndPhase {
fn from(record: &bcf::Record) -> Self {
let Ok(genotypes) = record.genotypes() else {
return UNKNOWN;
};
let gt = genotypes.get(0);
if gt.len() != 2 {
return UNKNOWN;
}
let [a0, a1] = [gt[0], gt[1]];
let phase = if let GenotypeAllele::Phased(..) | GenotypeAllele::PhasedMissing = a1 {
Some(read_phaseset(record))
} else {
None
};
GtAndPhase {
alleles: [a0.index(), a1.index()],
phase,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Haplotype {
H0,
H1,
Both,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OutputPhasing {
pub alleles: [u32; 2],
pub phaseset: Option<i32>,
}
impl OutputPhasing {
pub fn from_subcluster(haplo: Haplotype, phaseset: Option<i32>) -> Self {
let alleles = match haplo {
Haplotype::H0 => [1, 0], Haplotype::H1 => [0, 1], Haplotype::Both => [1, 1], };
Self { alleles, phaseset }
}
pub const fn is_phased(&self) -> bool {
self.phaseset.is_some()
}
pub const fn is_hom(&self) -> bool {
self.alleles[0] == self.alleles[1]
}
}
pub fn has_unphased_het(records: &[Arc<bcf::Record>]) -> bool {
records
.iter()
.any(|r| GtAndPhase::from(&**r).is_unphased_het())
}
type RecordAllele = (Arc<bcf::Record>, u32);
pub struct HaplotypeSubCluster {
pub records: Vec<RecordAllele>,
pub phaseset: Option<i32>,
pub haplo: Haplotype,
}
impl Debug for HaplotypeSubCluster {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HaplotypeSubCluster")
.field(
"records",
&self
.records
.iter()
.map(|(r, a)| {
let alleles = r.alleles();
let ra = alleles[0].as_bstr().to_owned();
let aa = alleles[*a as usize].as_bstr().to_owned();
format!("{}: {ra} -> {aa}", r.pos() + 1,)
})
.collect::<Vec<_>>(),
)
.field("phaseset", &self.phaseset)
.field("haplo", &self.haplo)
.finish()
}
}
fn read_phaseset(record: &bcf::Record) -> i32 {
record
.format(b"PS")
.integer()
.ok()
.and_then(|data| data.first().and_then(|s| s.first().copied()))
.unwrap_or(i32::MIN)
}
pub fn split_into_haplotype_clusters(
proximity_cluster: &[Arc<bcf::Record>],
settings: &ClusteringSettings,
) -> Vec<HaplotypeSubCluster> {
if proximity_cluster.len() == 1 {
let gtp = GtAndPhase::from(&*proximity_cluster[0]);
if gtp.is_hom_ref() {
counter!("clusters.unresolvable.hom_ref").inc(1);
return vec![];
}
if let GtAndPhase {
alleles: [Some(a0), Some(a1)],
phase,
} = gtp
&& a0 == a1
{
return vec![HaplotypeSubCluster {
records: proximity_cluster
.iter()
.map(|r| (Arc::clone(r), a0))
.collect(),
phaseset: phase,
haplo: Haplotype::Both,
}];
}
let GtAndPhase {
alleles: [a0, a1],
phase,
} = gtp;
let mut result = Vec::new();
if let Some(a0) = a0
&& a0 > 0
{
result.push(HaplotypeSubCluster {
records: proximity_cluster
.iter()
.map(|r| (Arc::clone(r), a0))
.collect(),
phaseset: phase,
haplo: Haplotype::H0,
});
}
if let Some(a1) = a1
&& a1 > 0
{
result.push(HaplotypeSubCluster {
records: proximity_cluster
.iter()
.map(|r| (Arc::clone(r), a1))
.collect(),
phaseset: phase,
haplo: Haplotype::H1,
});
}
return result;
}
let (has_unphased, has_missing) =
proximity_cluster
.iter()
.fold((false, false), |(unphased, missing), r| {
let gtp = GtAndPhase::from(&**r);
(
unphased | gtp.is_unphased_het(),
missing | gtp.has_missing(),
)
});
if has_unphased || has_missing {
if has_unphased {
counter!("clusters.unresolvable.unphased_het").inc(1);
} else {
counter!("clusters.unresolvable.missing_allele").inc(1);
}
return vec![];
}
let iter = proximity_cluster.iter().map(|r| GtAndPhase::from(&**r));
debug_assert!(iter.clone().all(|gtp| !gtp.has_missing()));
debug_assert!(iter.clone().all(|gtp| !gtp.is_unphased_het()));
debug_assert!(iter.clone().all(|gtp| gtp.is_hom() || gtp.is_phased()));
let mut phased_groups: HashMap<(i32, Haplotype), Vec<RecordAllele>> = HashMap::new();
let mut hom_alt: Vec<RecordAllele> = Vec::new();
for record in proximity_cluster {
match GtAndPhase::from(&**record) {
GtAndPhase {
alleles: [Some(a0), Some(a1)],
phase: Some(phase),
} if a0 != a1 => {
if a0 > 0 {
phased_groups
.entry((phase, Haplotype::H0))
.or_default()
.push((Arc::clone(record), a0));
}
if a1 > 0 {
phased_groups
.entry((phase, Haplotype::H1))
.or_default()
.push((Arc::clone(record), a1));
}
}
gtp @ GtAndPhase {
alleles: [Some(a), ..],
..
} if a > 0 && gtp.is_hom() => {
hom_alt.push((Arc::clone(record), a));
}
_ => {}
}
}
let mut result = Vec::new();
if phased_groups.is_empty() {
for sc in into_proximity_sub_clusters(&hom_alt, settings) {
result.push(HaplotypeSubCluster {
records: sc,
phaseset: None,
haplo: Haplotype::Both,
});
}
} else {
let phasesets: HashSet<i32> = phased_groups.keys().map(|(ps, _)| *ps).collect();
if !hom_alt.is_empty() {
counter!("phasing.records.homalt_duplicated").inc(hom_alt.len());
}
for ps in phasesets {
for hap in [Haplotype::H0, Haplotype::H1] {
let mut combined = phased_groups.remove(&(ps, hap)).unwrap_or_default();
combined.extend(hom_alt.iter().cloned());
combined.sort_by_key(|(r, _)| r.pos());
for sc in into_proximity_sub_clusters(&combined, settings) {
result.push(HaplotypeSubCluster {
records: sc,
phaseset: Some(ps),
haplo: hap,
});
}
}
}
}
if result.is_empty() {
counter!("clusters.unresolvable.no_valid_subcluster").inc(1);
}
trace!("Phasing result: {result:#?}");
result
}
fn into_proximity_sub_clusters(
candidates: &[RecordAllele],
settings: &ClusteringSettings,
) -> Vec<Vec<RecordAllele>> {
let mut result = Vec::new();
let mut current: Vec<RecordAllele> = Vec::new();
for record in candidates {
if settings
.belongs(current.last().map(|(r, _)| r.clone()), record.0.clone())
.unwrap_or(false)
{
current.push(record.clone());
} else {
if settings.is_cluster(¤t, |r| &*r.0, |r| Some(r.1)) {
result.push(std::mem::take(&mut current));
} else {
current.clear();
}
current.push(record.clone());
}
}
if settings.is_cluster(¤t, |r| &*r.0, |r| Some(r.1)) {
result.push(current);
}
result
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use rust_htslib::bcf::record::GenotypeAllele;
use rust_htslib::bcf::{self, Header, Writer};
use super::*;
fn make_header() -> Header {
let mut h = Header::new();
h.push_record(b"##contig=<ID=chr1,length=1000000>");
h.push_record(b"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
h.push_record(b"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase set\">");
h.push_sample(b"S1");
h
}
fn make_writer(header: &Header) -> Writer {
Writer::from_stdout(header, true, bcf::Format::Vcf).unwrap()
}
fn make_record(
writer: &Writer,
pos: i64,
alleles: &[&[u8]],
gt: &[GenotypeAllele],
ps: Option<i32>,
) -> bcf::Record {
let mut rec = writer.empty_record();
let rid = rec.header().name2rid(b"chr1").unwrap();
rec.set_rid(Some(rid));
rec.set_pos(pos);
rec.set_alleles(alleles).unwrap();
rec.push_genotypes(gt).unwrap();
if let Some(p) = ps {
rec.push_format_integer(b"PS", &[p]).unwrap();
}
rec
}
#[test]
fn classify_hom_ref_unphased() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(0)],
None,
);
assert!(GtAndPhase::from(&rec).is_hom_ref());
}
#[test]
fn classify_hom_ref_phased() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(0)],
None,
);
assert!(GtAndPhase::from(&rec).is_hom_ref());
}
#[test]
fn classify_hom_alt_unphased() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
None,
);
let gtp = GtAndPhase::from(&rec);
assert!(gtp.is_hom() && !gtp.is_hom_ref());
}
#[test]
fn classify_hom_alt_phased() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(1), GenotypeAllele::Phased(1)],
Some(100),
);
let gtp = GtAndPhase::from(&rec);
assert!(gtp.is_hom() && !gtp.is_hom_ref());
}
#[test]
fn classify_phased_het_0_1() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)],
Some(100),
);
let mut settings = default_settings();
settings.min_records = 1;
let sub = split_into_haplotype_clusters(&[Arc::new(rec)], &settings);
assert_eq!(sub.len(), 1);
assert_eq!(sub[0].haplo, Haplotype::H1);
assert_eq!(sub[0].phaseset, Some(100));
}
#[test]
fn classify_phased_het_1_0() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)],
Some(100),
);
let mut settings = default_settings();
settings.min_records = 1;
let sub = split_into_haplotype_clusters(&[Arc::new(rec)], &settings);
assert_eq!(sub.len(), 1);
assert_eq!(sub[0].haplo, Haplotype::H0);
assert_eq!(sub[0].phaseset, Some(100));
}
#[test]
fn classify_unphased_het() {
let h = make_header();
let w = make_writer(&h);
let rec = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
None,
);
assert!(GtAndPhase::from(&rec).is_unphased_het());
}
fn default_settings() -> ClusteringSettings {
ClusteringSettings::default()
}
#[test]
fn split_all_same_haplotype() {
let h = make_header();
let w = make_writer(&h);
let alleles: &[&[u8]] = &[b"A", b"T"];
let gt = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
let records: Vec<Arc<bcf::Record>> = vec![
Arc::new(make_record(&w, 100, alleles, gt, Some(100))),
Arc::new(make_record(&w, 105, alleles, gt, Some(100))),
Arc::new(make_record(&w, 110, alleles, gt, Some(100))),
];
let sub = split_into_haplotype_clusters(&records, &default_settings());
assert_eq!(sub.len(), 1);
let h1 = sub.iter().find(|sc| sc.haplo == Haplotype::H1).unwrap();
assert_eq!(h1.phaseset, Some(100));
assert_eq!(h1.records.len(), 3);
}
#[test]
fn split_mixed_haplotypes_with_hom_alt() {
let h = make_header();
let w = make_writer(&h);
let r01 = Arc::new(make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)],
Some(100),
));
let r10 = Arc::new(make_record(
&w,
105,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)],
Some(100),
));
let r11 = Arc::new(make_record(
&w,
108,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
None,
));
let mut settings = default_settings();
settings.min_records = 1;
let records = vec![r01, r10, r11];
let sub = split_into_haplotype_clusters(&records, &settings);
let haps: Vec<Haplotype> = sub.iter().map(|sc| sc.haplo).collect();
assert!(haps.contains(&Haplotype::H0));
assert!(haps.contains(&Haplotype::H1));
assert!(
!haps.contains(&Haplotype::Both),
"split mode emits no `Both`"
);
let containing: Vec<&HaplotypeSubCluster> = sub
.iter()
.filter(|sc| sc.records.iter().any(|(r, _)| r.pos() == 108))
.collect();
assert_eq!(containing.len(), 2, "hom-alt folds onto both haplotypes");
let containing_haps: Vec<Haplotype> = containing.iter().map(|sc| sc.haplo).collect();
assert!(containing_haps.contains(&Haplotype::H0));
assert!(containing_haps.contains(&Haplotype::H1));
let h0 = sub.iter().find(|sc| sc.haplo == Haplotype::H0).unwrap();
assert!(h0.records.iter().any(|(r, _)| r.pos() == 105));
let h1 = sub.iter().find(|sc| sc.haplo == Haplotype::H1).unwrap();
assert!(h1.records.iter().any(|(r, _)| r.pos() == 100));
}
#[test]
fn hom_alt_context_rescues_gap_split() {
let h = make_header();
let w = make_writer(&h);
let gt10 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)];
let gt11 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)];
let r1 = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt10, Some(100)));
let rhom = Arc::new(make_record(&w, 118, &[b"A", b"T"], gt11, None));
let r2 = Arc::new(make_record(&w, 136, &[b"A", b"T"], gt10, Some(100)));
let mut settings = default_settings();
settings.min_records = 1;
settings.min_density = 0.0; let sub = split_into_haplotype_clusters(&[r1, rhom, r2], &settings);
let h0: Vec<_> = sub.iter().filter(|sc| sc.haplo == Haplotype::H0).collect();
assert_eq!(
h0.len(),
1,
"hom-alt context keeps H0 hets in one sub-cluster"
);
assert_eq!(h0[0].records.len(), 3);
assert!(!sub.iter().any(|sc| sc.haplo == Haplotype::Both));
}
#[test]
fn pure_hom_alt_emits_both_once() {
let h = make_header();
let w = make_writer(&h);
let gt11 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)];
let r1 = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt11, None));
let r2 = Arc::new(make_record(&w, 108, &[b"A", b"T"], gt11, None));
let mut settings = default_settings();
settings.min_records = 1;
let sub = split_into_haplotype_clusters(&[r1, r2], &settings);
assert!(sub.iter().all(|sc| sc.haplo == Haplotype::Both));
for pos in [100, 108] {
let n = sub
.iter()
.filter(|sc| sc.records.iter().any(|(r, _)| r.pos() == pos))
.count();
assert_eq!(n, 1, "pure hom-alt must emit exactly once");
}
}
#[test]
fn hom_alt_split_lone_on_other_hap() {
let h = make_header();
let w = make_writer(&h);
let gt10 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)];
let gt01 = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
let gt11 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)];
let r_h0 = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt10, Some(100)));
let r_hom = Arc::new(make_record(&w, 110, &[b"A", b"T"], gt11, None));
let r_h1 = Arc::new(make_record(&w, 500, &[b"A", b"T"], gt01, Some(100)));
let mut settings = default_settings();
settings.min_records = 1;
settings.min_density = 0.0; let sub = split_into_haplotype_clusters(&[r_h0, r_hom, r_h1], &settings);
assert!(!sub.iter().any(|sc| sc.haplo == Haplotype::Both));
let h0 = sub.iter().find(|sc| sc.haplo == Haplotype::H0).unwrap();
assert!(h0.records.iter().any(|(r, _)| r.pos() == 100));
assert!(h0.records.iter().any(|(r, _)| r.pos() == 110));
assert!(
sub.iter()
.any(|sc| sc.haplo == Haplotype::H1
&& sc.records.iter().any(|(r, _)| r.pos() == 110))
);
assert!(
sub.iter()
.any(|sc| sc.haplo == Haplotype::H1
&& sc.records.iter().any(|(r, _)| r.pos() == 500))
);
}
#[test]
fn multi_phaseset_hom_alt_split() {
let h = make_header();
let w = make_writer(&h);
let gt01 = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
let gt11 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)];
let ps_a = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt01, Some(100)));
let hom = Arc::new(make_record(&w, 110, &[b"A", b"T"], gt11, None));
let ps_b = Arc::new(make_record(&w, 120, &[b"A", b"T"], gt01, Some(999)));
let mut settings = default_settings();
settings.min_records = 1;
settings.min_density = 0.0; let sub = split_into_haplotype_clusters(&[ps_a, hom, ps_b], &settings);
assert!(!sub.iter().any(|sc| sc.haplo == Haplotype::Both));
let psets: Vec<i32> = sub
.iter()
.filter(|sc| sc.haplo == Haplotype::H1)
.filter_map(|sc| sc.phaseset)
.collect();
assert!(psets.contains(&100));
assert!(psets.contains(&999));
let in_h1 = sub
.iter()
.filter(|sc| {
sc.haplo == Haplotype::H1 && sc.records.iter().any(|(r, _)| r.pos() == 110)
})
.count();
assert!(in_h1 >= 1);
}
#[test]
fn split_unphased_het_with_phased_blocks_cluster() {
let h = make_header();
let w = make_writer(&h);
let r01 = Arc::new(make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)],
Some(100),
));
let r_unphased = Arc::new(make_record(
&w,
105,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
None,
));
let records = vec![r01, r_unphased];
let before = counter!("clusters.unresolvable.unphased_het").get();
let sub = split_into_haplotype_clusters(&records, &default_settings());
assert!(sub.is_empty(), "UnphasedHet + Phased should block");
assert_eq!(
counter!("clusters.unresolvable.unphased_het").get() - before,
1
);
}
#[test]
fn split_two_unphased_hets_blocks_cluster() {
let h = make_header();
let w = make_writer(&h);
let gt_unphased = &[GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)];
let r1 = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt_unphased, None));
let r2 = Arc::new(make_record(&w, 105, &[b"A", b"T"], gt_unphased, None));
let records = vec![r1, r2];
let sub = split_into_haplotype_clusters(&records, &default_settings());
assert!(sub.is_empty(), "two UnphasedHet records should block");
}
#[test]
fn split_single_unphased_het_is_realigned() {
let h = make_header();
let w = make_writer(&h);
let mut settings = default_settings();
settings.min_records = 1;
let r = make_record(
&w,
100,
&[b"A", b"T"],
&[GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
None,
);
let sub = split_into_haplotype_clusters(&[Arc::new(r)], &settings);
assert_eq!(sub.len(), 1);
assert_eq!(sub[0].haplo, Haplotype::H1);
assert!(sub[0].phaseset.is_none());
}
#[test]
fn split_different_phasesets() {
let h = make_header();
let w = make_writer(&h);
let gt = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
let r1 = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt, Some(100)));
let r2 = Arc::new(make_record(&w, 105, &[b"A", b"T"], gt, Some(999)));
let mut settings = default_settings();
settings.min_records = 1; let records = vec![r1, r2];
let sub = split_into_haplotype_clusters(&records, &settings);
let psets: Vec<i32> = sub.iter().filter_map(|sc| sc.phaseset).collect();
assert!(psets.contains(&100));
assert!(psets.contains(&999));
}
#[test]
fn split_gap_introduced_by_removing_other_haplotype() {
let h = make_header();
let w = make_writer(&h);
let gt01 = &[GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)];
let gt10 = &[GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)];
let r1 = Arc::new(make_record(&w, 100, &[b"A", b"T"], gt01, Some(100)));
let r2 = Arc::new(make_record(&w, 110, &[b"A", b"T"], gt10, Some(100)));
let r3 = Arc::new(make_record(&w, 200, &[b"A", b"T"], gt01, Some(100)));
let mut settings = default_settings();
settings.min_records = 1;
let records = vec![r1, r2, r3];
let sub = split_into_haplotype_clusters(&records, &settings);
let hap1_clusters: Vec<_> = sub.iter().filter(|sc| sc.haplo == Haplotype::H1).collect();
assert_eq!(
hap1_clusters.len(),
2,
"gap should split hap=1 into two sub-clusters"
);
}
}