use std::{mem, ops::Range};
use anyhow::bail;
use lib_tsalign::a_star_aligner::{
alignment_result::{IAlignmentType, alignment::Alignment},
template_switch_distance::{AlignmentType, TemplateSwitchPrimary},
};
use rust_htslib::bcf::Record;
use tracing::error;
use crate::{
common::coords::GenomePosition,
vcf::{
pipeline::{message::MaskedRecords, writer::RecordProperties},
strings,
},
};
mod statistics;
pub fn create_records<I>(
alignment: I,
reference_sequence: (usize, &[u8]),
query_sequence: (usize, &[u8]),
old_records: Option<MaskedRecords>,
empty_record: &Record,
ref_start: &GenomePosition,
) -> anyhow::Result<Vec<(Record, RecordProperties)>>
where
I: Iterator<Item = (usize, AlignmentType)>,
{
let candidates = {
let mut producer =
CandidateProducer::new(alignment, reference_sequence.0, query_sequence.0);
std::iter::from_fn(|| producer.next().transpose()).collect::<anyhow::Result<Vec<_>>>()?
};
let mut leading_matches = Some({
let len = reference_sequence.0.min(query_sequence.0);
let r_range = (reference_sequence.0 - len)..reference_sequence.0;
let q_range = (query_sequence.0 - len)..query_sequence.0;
VariantCandidate {
r_range,
q_range,
align: vec![(len, AlignmentType::PrimaryMatch)],
}
});
let mut reverse_candidates_iterator = candidates.into_iter().rev().map(Some).peekable();
let mut normalized_candidates = Vec::new();
loop {
let Some(this) = reverse_candidates_iterator.next() else {
break;
};
let Some(mut this) = this else {
continue;
};
while !this.is_normalized() {
let prev = {
while reverse_candidates_iterator
.next_if(std::option::Option::is_none)
.is_some()
{}
reverse_candidates_iterator.peek_mut()
}
.unwrap_or(&mut leading_matches);
let Some(transfer_unit) = prev.as_mut().and_then(VariantCandidate::pop_unit_back)
else {
bail!("Implementation error")
};
this.push_to_front(transfer_unit)?;
if prev.as_ref().is_some_and(VariantCandidate::is_empty) {
*prev = None;
}
}
if this
.align
.iter()
.any(|(_, ty)| !matches!(ty, AlignmentType::PrimaryMatch))
{
normalized_candidates.push(this);
}
}
let records: Vec<(Record, RecordProperties)> = normalized_candidates
.into_iter()
.rev() .map(|candidate| {
candidate_to_bcf_record(
empty_record.clone(), ref_start,
reference_sequence.1,
query_sequence.1,
candidate,
old_records,
)
})
.inspect(|r| {
if let Err(e) = r {
error!("{e}");
}
})
.flatten()
.collect();
Ok(records)
}
fn candidate_to_bcf_record(
mut template: Record,
ref_start: &GenomePosition,
ref_sequence: &[u8],
alt_sequence: &[u8],
candidate: VariantCandidate,
old_records: Option<MaskedRecords>,
) -> anyhow::Result<(Record, RecordProperties)> {
let rid = template.header().name2rid(ref_start.contig().as_ref())?;
let vcf_start_pos_0 = ref_start.position_0() + candidate.r_range.start;
let properties = RecordProperties::Realigned {
has_ts: candidate.has_ts(),
};
template.push_info_string(
strings::VCF_TS_CIGARETS_KEY.as_bytes(),
&[vcf_encode_info_string(candidate.cigar()).as_bytes()],
)?;
template.set_rid(Some(rid));
template.set_pos(i64::try_from(vcf_start_pos_0)?);
template.set_alleles(&[
&ref_sequence[candidate.r_range],
&alt_sequence[candidate.q_range],
])?;
statistics::compute_statistics(&mut template, old_records)?;
Ok((template, properties))
}
fn vcf_encode_info_string(mut string: String) -> String {
const REPLACEMENTS: [(&str, &str); 4] =
[("%", "%25"), (";", "%3B"), ("=", "%3D"), (",", "%2C")];
for (from, to) in REPLACEMENTS {
string = string.replace(from, to);
}
string
}
#[allow(unused, reason = "used in tests")]
fn vcf_decode_info_string(mut string: String) -> String {
const REPLACEMENTS: [(&str, &str); 4] =
[("%3B", ";"), ("%3D", "="), ("%2C", ","), ("%25", "%")];
for (from, to) in REPLACEMENTS {
string = string.replace(from, to);
}
string
}
#[derive(Debug)]
struct VariantCandidate {
r_range: Range<usize>,
q_range: Range<usize>,
align: Vec<(usize, AlignmentType)>,
}
impl VariantCandidate {
fn is_normalized(&self) -> bool {
!self.r_range.is_empty() && !self.q_range.is_empty() && !self.align.is_empty()
}
fn has_ts(&self) -> bool {
for (_, ty) in &self.align {
if let AlignmentType::TemplateSwitchEntrance { .. } = ty {
return true;
}
}
false
}
fn is_empty(&self) -> bool {
self.align.is_empty()
}
fn pop_unit_back(&mut self) -> Option<Self> {
match self.align.last_mut() {
Some((n, ty)) if ty.is_repeatable() => {
let ty = *ty;
*n -= 1;
if *n == 0 {
self.align.pop();
}
if consumes_query(&ty) {
self.q_range.end -= 1;
}
if consumes_reference(&ty) {
self.r_range.end -= 1;
}
Some(Self {
r_range: self.r_range.end
..self.r_range.end + usize::from(consumes_reference(&ty)),
q_range: self.q_range.end..self.q_range.end + usize::from(consumes_query(&ty)),
align: vec![(1, ty)],
})
}
Some((_, AlignmentType::TemplateSwitchExit { anti_primary_gap })) => {
let anti_primary_gap = *anti_primary_gap;
let mut i = self.align.len() - 1;
let mut consumed_reference = 0;
let mut consumed_query = 0;
loop {
let (n, ty) = self.align[i];
if matches!(ty, AlignmentType::TemplateSwitchEntrance { .. }) {
break;
}
if consumes_reference(&ty) {
consumed_reference += n;
}
if consumes_query(&ty) {
consumed_query += n;
}
i -= 1;
}
let ts = self.align.split_off(i);
let old_ref_end = self.r_range.end;
let old_qry_end = self.q_range.end;
let AlignmentType::TemplateSwitchEntrance { primary, .. } = ts[0].1 else {
error!("implementation error");
return None;
};
match primary {
TemplateSwitchPrimary::Reference => {
self.r_range.end -= consumed_reference;
self.q_range.end = old_qry_end.saturating_add_signed(-anti_primary_gap);
}
TemplateSwitchPrimary::Query => {
self.r_range.end = old_ref_end.saturating_add_signed(-anti_primary_gap);
self.q_range.end -= consumed_query;
}
}
Some(Self {
r_range: self.r_range.end..old_ref_end,
q_range: self.q_range.end..old_qry_end,
align: ts,
})
}
unexpected => {
error!("Implementation error! {unexpected:?}");
None
}
}
}
fn push_to_front(&mut self, mut other: Self) -> anyhow::Result<()> {
if self.r_range.start != other.r_range.end {
bail!(
"Implementation error, tried to combine alignments with hole, self: {self:?} with align <> other: {other:?} with align <>"
);
}
self.r_range.start = other.r_range.start;
self.q_range.start = other.q_range.start;
match (self.align.first_mut(), other.align.last()) {
(Some((s_n, s_ty)), Some((o_n, o_ty))) if s_ty.is_repeated(o_ty) => {
*s_n += *o_n;
other.align.pop();
}
_ => {}
}
other.align.append(&mut self.align);
mem::swap(&mut self.align, &mut other.align);
Ok(())
}
fn cigar(&self) -> String {
let mut aln = Alignment::new();
for &(n, ty) in &self.align {
for _ in 0..n {
aln.push(ty);
}
}
aln.cigar().replace('=', "M") }
}
struct CandidateProducer<I> {
alignment: I,
r_index: usize,
q_index: usize,
}
impl<I> CandidateProducer<I>
where
I: Iterator<Item = (usize, AlignmentType)>,
{
fn new(alignment: I, r_index: usize, q_index: usize) -> Self {
Self {
alignment,
r_index,
q_index,
}
}
fn next(&mut self) -> anyhow::Result<Option<VariantCandidate>> {
let Some((n, ty)) = self.alignment.next() else {
return Ok(None);
};
let ret = match ty {
AlignmentType::PrimaryMatch
| AlignmentType::PrimarySubstitution
| AlignmentType::PrimaryInsertion
| AlignmentType::PrimaryDeletion => self.consume_linear(n, ty),
entrance @ AlignmentType::TemplateSwitchEntrance { .. } => {
self.consume_ts_naive(entrance)?
}
AlignmentType::Root | AlignmentType::PrimaryReentry => return Ok(None),
_ => bail!("Implementation error"),
};
Ok(Some(ret))
}
fn consume_linear(&mut self, n: usize, ty: AlignmentType) -> VariantCandidate {
let r_end = if consumes_reference(&ty) {
self.r_index + n
} else {
self.r_index
};
let q_end = if consumes_query(&ty) {
self.q_index + n
} else {
self.q_index
};
let record = VariantCandidate {
r_range: self.r_index..r_end,
q_range: self.q_index..q_end,
align: vec![(n, ty)],
};
self.r_index = r_end;
self.q_index = q_end;
record
}
fn consume_ts_naive(&mut self, entrance: AlignmentType) -> anyhow::Result<VariantCandidate> {
let AlignmentType::TemplateSwitchEntrance { primary, .. } = entrance else {
bail!("Implementation error");
};
let mut align = vec![(1, entrance)];
let mut r_end = self.r_index;
let mut q_end = self.q_index;
let mut step_in_ts = |n, ty| {
align.push((n, ty));
if consumes_reference(&ty) {
r_end += n;
}
if consumes_query(&ty) {
q_end += n;
}
};
for (n, ty) in self.alignment.by_ref() {
match ty {
AlignmentType::SecondaryInsertion
| AlignmentType::SecondaryDeletion
| AlignmentType::SecondarySubstitution
| AlignmentType::SecondaryMatch => step_in_ts(n, ty),
exit @ AlignmentType::TemplateSwitchExit { anti_primary_gap } => {
align.push((1, exit));
match primary {
TemplateSwitchPrimary::Reference => {
q_end = self.q_index.saturating_add_signed(anti_primary_gap);
}
TemplateSwitchPrimary::Query => {
r_end = self.r_index.saturating_add_signed(anti_primary_gap);
}
}
let r_range = self.r_index..r_end;
let q_range = self.q_index..q_end;
self.r_index = r_end;
self.q_index = q_end;
return Ok(VariantCandidate {
r_range,
q_range,
align,
});
}
AlignmentType::SecondaryRoot => {}
_ => bail!("Implementation error"),
}
}
bail!("Unexpected end of alignment without TS Exit...");
}
}
fn consumes(ty: &AlignmentType) -> Option<(bool, bool)> {
match ty {
AlignmentType::PrimaryInsertion
| AlignmentType::PrimaryFlankInsertion
| AlignmentType::SecondaryInsertion => Some((false, true)),
AlignmentType::PrimaryDeletion
| AlignmentType::PrimaryFlankDeletion
| AlignmentType::SecondaryDeletion => Some((true, false)),
AlignmentType::PrimarySubstitution
| AlignmentType::PrimaryFlankSubstitution
| AlignmentType::PrimaryMatch
| AlignmentType::PrimaryFlankMatch
| AlignmentType::SecondarySubstitution
| AlignmentType::SecondaryMatch => Some((true, true)),
AlignmentType::Root | AlignmentType::SecondaryRoot | AlignmentType::PrimaryReentry => {
Some((false, false))
}
AlignmentType::TemplateSwitchEntrance { .. }
| AlignmentType::TemplateSwitchExit { .. }
| AlignmentType::PrimaryShortcut { .. } => None,
}
}
fn consumes_reference(ty: &AlignmentType) -> bool {
consumes(ty).is_some_and(|(r, _)| r)
}
fn consumes_query(ty: &AlignmentType) -> bool {
consumes(ty).is_some_and(|(_, q)| q)
}
#[cfg(test)]
mod tests {
use crate::{common::contig::ContigName, vcf::augment_header};
use super::*;
use anyhow::Error;
use bstr::ByteSlice;
use lib_tsalign::a_star_aligner::{
alignment_geometry::{AlignmentCoordinates, AlignmentRange},
alignment_result::AlignmentResult,
configurable_a_star_align::Aligner,
template_switch_distance::AlignmentType,
};
use rust_htslib::bcf::{self, Header, Read, Writer};
fn get_dummy_record_and_genotype() -> (Writer, Vec<bcf::record::GenotypeAllele>) {
let mut bcf_reader = bcf::Reader::from_path("./test_files/test.vcf").unwrap();
let mut header = Header::from_template(bcf_reader.header());
augment_header(&mut header, false);
let writer = Writer::from_stdout(&header, true, bcf::Format::Vcf).unwrap();
if let Some(Ok(record)) = bcf_reader.records().next() {
let genotype_alleles = record
.genotypes()
.expect("Error parsing genotypes")
.get(0)
.iter()
.copied()
.collect();
(writer, genotype_alleles)
} else {
panic!("No records found in BCF data");
}
}
#[test]
fn test_create_records_insertion() -> Result<(), Error> {
let alignment = vec![
(1, AlignmentType::PrimaryMatch),
(1, AlignmentType::PrimaryInsertion),
(3, AlignmentType::PrimaryMatch),
];
let reference_sequence = b"ACGT";
let query_sequence = b"AGCGT";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(0, 0),
AlignmentCoordinates::new(reference_sequence.len(), query_sequence.len()),
);
let (writer, _) = get_dummy_record_and_genotype();
let records = create_records(
alignment.into_iter(),
(ranges.reference_offset(), reference_sequence),
(ranges.query_offset(), query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
assert_eq!(records.len(), 1);
let record = &records[0];
assert_eq!(record.0.pos(), 1000);
assert_eq!(record.0.alleles()[0], b"A");
assert_eq!(record.0.alleles()[1], b"AG");
Ok(())
}
#[test]
fn test_create_records_deletion() -> Result<(), Error> {
let alignment = vec![
(1, AlignmentType::PrimaryMatch),
(1, AlignmentType::PrimaryDeletion),
(3, AlignmentType::PrimaryMatch),
];
let reference_sequence = b"AGCGT";
let query_sequence = b"ACGT";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(0, 0),
AlignmentCoordinates::new(reference_sequence.len(), query_sequence.len()),
);
let (writer, _) = get_dummy_record_and_genotype();
let records = create_records(
alignment.into_iter(),
(ranges.reference_offset(), reference_sequence),
(ranges.query_offset(), query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
assert_eq!(records.len(), 1);
let record = &records[0];
assert_eq!(record.0.pos(), 1000);
assert_eq!(record.0.alleles()[0], b"AG");
assert_eq!(record.0.alleles()[1], b"A");
Ok(())
}
#[test]
fn test_create_records_substitution() -> Result<(), Error> {
let alignment = vec![
(1, AlignmentType::PrimaryMatch),
(1, AlignmentType::PrimarySubstitution),
(2, AlignmentType::PrimaryMatch),
];
let reference_sequence = b"ACGT";
let query_sequence = b"AGGT";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(0, 0),
AlignmentCoordinates::new(reference_sequence.len(), query_sequence.len()),
);
let (writer, gt) = get_dummy_record_and_genotype();
let mut record = writer.empty_record();
record.push_genotypes(>)?;
let records = create_records(
alignment.into_iter(),
(ranges.reference_offset(), reference_sequence),
(ranges.query_offset(), query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
assert_eq!(records.len(), 1);
let record = &records[0];
assert_eq!(record.0.pos(), 1001);
assert_eq!(record.0.alleles()[0], b"C");
assert_eq!(record.0.alleles()[1], b"G");
Ok(())
}
#[test]
fn test_create_records_mixed() -> Result<(), Error> {
let alignment = vec![
(1, AlignmentType::PrimaryMatch),
(1, AlignmentType::PrimaryInsertion),
(1, AlignmentType::PrimaryMatch),
(1, AlignmentType::PrimaryDeletion),
(1, AlignmentType::PrimaryMatch),
];
let reference_sequence = b"ACGT";
let query_sequence = b"AGCT";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(0, 0),
AlignmentCoordinates::new(reference_sequence.len(), query_sequence.len()),
);
let (writer, _) = get_dummy_record_and_genotype();
let records = create_records(
alignment.into_iter(),
(ranges.reference_offset(), reference_sequence),
(ranges.query_offset(), query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
for r in &records {
println!("{}", r.0.to_vcf_string().unwrap());
}
assert_eq!(records.len(), 2);
let r1 = &records[0];
assert_eq!(r1.0.pos(), 1000);
assert_eq!(r1.0.alleles()[0], b"A");
assert_eq!(r1.0.alleles()[1], b"AG");
let r2 = &records[1];
assert_eq!(r2.0.pos(), 1001);
assert_eq!(r2.0.alleles()[0], b"CG");
assert_eq!(r2.0.alleles()[1], b"C");
Ok(())
}
#[test]
fn test_create_records_no_op() -> Result<(), Error> {
let alignment = vec![];
let reference_sequence = b"ACGT";
let query_sequence = b"ACGT";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(0, 0),
AlignmentCoordinates::new(reference_sequence.len(), query_sequence.len()),
);
let (writer, _) = get_dummy_record_and_genotype();
let records = create_records(
alignment.into_iter(),
(ranges.reference_offset(), reference_sequence),
(ranges.query_offset(), query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
assert!(records.is_empty());
let alignment = vec![(4, AlignmentType::PrimaryMatch)];
let records = create_records(
alignment.into_iter(),
(ranges.reference_offset(), reference_sequence),
(ranges.query_offset(), query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
assert!(records.is_empty());
Ok(())
}
#[test]
fn test_create_records_for_ts() -> Result<(), Error> {
let reference_sequence = b"AAAAACGTGGGG";
let query_sequence = b"AAAAACTTTTTGGGG";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(4, 4),
AlignmentCoordinates::new(8, 11),
);
let alignment = Aligner::new().align(
"heh",
reference_sequence,
"kek",
query_sequence,
Some(ranges.clone()),
None,
None,
);
let (writer, _) = get_dummy_record_and_genotype();
let AlignmentResult::WithTarget {
alignment,
statistics,
} = alignment
else {
panic!();
};
let records = create_records(
alignment.iter_compact_cloned(),
(statistics.reference_offset, reference_sequence),
(statistics.query_offset, query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
for r in &records {
println!("{}", r.0.to_vcf_string()?);
}
assert_eq!(1, records.len());
let r = &records[0];
assert_eq!(1006, r.0.pos());
assert_eq!(b"GT", r.0.alleles()[0]);
assert_eq!(b"TTTTT", r.0.alleles()[1]);
assert_eq!(2, r.0.allele_count());
Ok(())
}
#[test]
fn test_create_records_for_ts_with_ref_repeat() -> Result<(), Error> {
let reference_sequence = b"AAAAACGTGGGG";
let query_sequence = b"AAAAACTTTTTAACGTGGGG";
let ranges = AlignmentRange::new_offset_limit(
AlignmentCoordinates::new(4, 4),
AlignmentCoordinates::new(8, 16),
);
let alignment = Aligner::new().align(
"heh",
reference_sequence,
"kek",
query_sequence,
Some(ranges.clone()),
None,
None,
);
let (writer, _) = get_dummy_record_and_genotype();
let AlignmentResult::WithTarget {
alignment,
statistics,
} = alignment
else {
panic!();
};
let records = create_records(
alignment.iter_compact_cloned(),
(statistics.reference_offset, reference_sequence),
(statistics.query_offset, query_sequence),
None,
&writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 1000),
)?;
for r in &records {
println!("{}", r.0.to_vcf_string()?);
}
assert_eq!(1, records.len());
let r = &records[0];
assert_eq!(1002, r.0.pos());
assert_eq!(b"A", r.0.alleles()[0]);
assert_eq!(b"AAACTTTTT", r.0.alleles()[1]);
assert_eq!(2, r.0.allele_count());
Ok(())
}
#[test]
fn test_candidate_to_bcf() -> Result<(), Error> {
let candidate = VariantCandidate {
r_range: 5..10,
q_range: 5..6,
align: vec![
(1, AlignmentType::PrimaryMatch),
(4, AlignmentType::PrimaryDeletion),
],
};
let (mut writer, gts) = get_dummy_record_and_genotype();
let mut record = candidate_to_bcf_record(
writer.empty_record(),
&GenomePosition::new_0(ContigName::new(b"chrZ"), 100),
b"ABCDEFGHIJKL",
b"ABCDEFKL",
candidate,
None,
)?;
record.0.push_genotypes(>s)?;
writer.write(&record.0)?;
assert_eq!(record.0.pos(), 105);
assert_eq!(record.0.alleles(), vec!["FGHIJ".as_bytes(), "F".as_bytes()]);
let cigar = record
.0
.info(strings::VCF_TS_CIGARETS_KEY.as_bytes())
.string()?
.unwrap();
assert_eq!(
vcf_decode_info_string(cigar[0].to_str()?.to_string()),
"1M4D"
);
Ok(())
}
}