1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
use crate::data_types::phase_enums::Allele;
use crate::data_types::variants::Variant;
use crate::dwfa::dynamic_wfa::{DWFAError, DWFALite};
use crate::util::sequence_alignment::edit_distance;
#[derive(thiserror::Error, Debug)]
pub enum HapDWFAError {
#[error("error from DWFA: {error}")]
DError { error: DWFAError },
#[error("cannot extend Allele::Unknown")]
UnknownAllele,
}
/// Haplotype-specific details. This node does not track the comparators for efficiency, so they must be provided with each call.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct HaplotypeDWFA {
/// Tracks the constructed truth haplotype
truth_haplotype: HaplotypeTracker,
/// Tracks the constructed query haplotype
query_haplotype: HaplotypeTracker,
/// The DWFA that tracks our edit distance to the baseline/comparator sequence
dwfa: DWFALite
}
impl HaplotypeDWFA {
/// Constructor
/// # Arguments
/// * `region_start` - reference start of this DWFA
pub fn new(region_start: usize, max_edit_distance: usize) -> Self {
Self {
truth_haplotype: HaplotypeTracker::new(region_start),
query_haplotype: HaplotypeTracker::new(region_start),
// no wildcards, and we want end-to-end without early termination
dwfa: DWFALite::with_max_edit_distance(max_edit_distance)
}
}
/// Adds variants sequence to both haplotypes. Returns true if any ALT components were successfully incorporated.
/// # Arguments
/// * `reference` - the full reference sequence
/// * `is_truth` - if true, add the variant to the truth sequence; otherwise, to the query sequence
/// * `variant` - the variant we are traversing
/// * `allele` - the REF/ALT indication
/// * `sync_extension` - if provided, this will further copy the reference sequence to both truth and query
pub fn extend_variant(&mut self, reference: &[u8], is_truth: bool, variant: &Variant, allele: Allele, sync_extension: Option<usize>) -> Result<bool, HapDWFAError> {
// extend the appropriate haplotype
let success = if is_truth {
// extend our query by the reference up to the sync point, passing the sync point also
if let Some(sync_pos) = sync_extension {
self.query_haplotype.copy_reference(reference, sync_pos)?;
}
// now add the variant to truth
self.truth_haplotype.extend_variant(reference, variant, allele, sync_extension)?
} else {
// extend our truth by the reference up to the sync point
if let Some(sync_pos) = sync_extension {
self.truth_haplotype.copy_reference(reference, sync_pos)?;
}
// now add the variant to query, passing the sync point also
self.query_haplotype.extend_variant(reference, variant, allele, sync_extension)?
};
// now update the DWFA
self.update_dwfa()?;
Ok(success)
}
/// Updates the contained DWFA after a change to the internally tracked sequence.
/// # Arguments
/// * `baseline` - the baseline sequence we compare against
fn update_dwfa(&mut self) -> Result<(), HapDWFAError> {
// update each DWFA, translate errors to anyhow
match self.dwfa.update(self.truth_haplotype.sequence(), self.query_haplotype.sequence()) {
Ok(_) => Ok(()),
Err(error) => Err(HapDWFAError::DError { error })
}
}
/// Fills out the rest of the region with reference sequence and then finalizes the DWFA.
/// # Arguments
/// * `reference` - the full reference sequence
/// * `region_end` - the end position of the region
pub fn finalize_dwfa(&mut self, reference: &[u8], region_end: usize) -> Result<(), HapDWFAError> {
// add any remaining reference sequence
self.truth_haplotype.copy_reference(reference, region_end)?;
self.query_haplotype.copy_reference(reference, region_end)?;
self.update_dwfa()?;
// finalize the DWFA, translate errors to anyhow
match self.dwfa.finalize(self.truth_haplotype.sequence(), self.query_haplotype.sequence()) {
Ok(_) => Ok(()),
Err(error) => Err(HapDWFAError::DError { error })
}
}
/// Returns true if the sequences are identical AND they are waiting at the same reference position.
/// In this situation, they are "synchronized", in the sense that all following changes are independent of the previous changes.
pub fn is_synchronized(&self) -> bool {
/*
Key insight comes from https://www.biorxiv.org/content/10.1101/023754v2.full.pdf
Page 3, part (c) of the reducing paths explanation, paraphrased:
- IF the two paths so far are completely identical (same length also, so not just compatible)
- AND they are at the same "reference" position
- THEN any changes happening after this point are 100% independent of what has happened up to this point
In other words, we *know* this node is on the best path, so all other nodes can be pruned from the search space.
*/
self.edit_distance() == 0 &&
self.truth_haplotype.sequence().len() == self.query_haplotype.sequence().len() &&
self.truth_haplotype.ref_pos() == self.query_haplotype.ref_pos()
}
/// Edit distance from the underlying DWFA.
pub fn edit_distance(&self) -> usize {
self.dwfa.edit_distance()
}
/// Returns the total number of set alleles across both truth and query
pub fn set_alleles(&self) -> usize {
self.truth_haplotype.alleles().len() + self.query_haplotype.alleles().len()
}
/// Returns the total skip distance of truth + query
pub fn total_variant_skip_distance(&self) -> usize {
self.truth_haplotype.variant_skip_distance() + self.query_haplotype.variant_skip_distance()
}
/// Returns the total cost of this comparison, which includes both the edit distance from the DWFA and also the cost of ignored, incompatible variants.
pub fn total_cost(&self) -> usize {
self.edit_distance() + self.total_variant_skip_distance()
}
// getters
pub fn truth_haplotype(&self) -> &HaplotypeTracker {
&self.truth_haplotype
}
pub fn query_haplotype(&self) -> &HaplotypeTracker {
&self.query_haplotype
}
}
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct HaplotypeTracker {
/// Current position in the reference sequence
ref_pos: usize,
/// The alleles associated with this node
alleles: Vec<Allele>,
/// The generated haplotype sequence so far
sequence: Vec<u8>,
/// The total ED of skipped variants
variant_skip_distance: usize
}
impl HaplotypeTracker {
/// Constructor
/// # Arguments
/// * `region_start` - tracks the current reference position
pub fn new(region_start: usize) -> Self {
Self {
ref_pos: region_start,
alleles: vec![],
sequence: vec![],
variant_skip_distance: 0
}
}
/// Adds variants sequence to both haplotypes. Returns true if any ALT components were successfully incorporated.
/// # Arguments
/// * `reference` - the full reference sequence
/// * `variant` - the variant we are traversing
/// * `allele` - the REF/ALT indication
/// * `ref_extension` - if provided, this will further copy the reference sequence after the allele is added through the provided position
pub fn extend_variant(&mut self, reference: &[u8], variant: &Variant, allele: Allele, ref_extension: Option<usize>) -> Result<bool, HapDWFAError> {
// no matter what, we can always add reference sequence up to the variant start
let variant_start = variant.position() as usize;
self.copy_reference(reference, variant_start)?;
// extend hap1 by the corresponding allele
let success = match allele {
Allele::Unknown => return Err(HapDWFAError::UnknownAllele),
Allele::Reference => {
// reference allele, so nothing to add
true
},
Allele::Alternate => {
// check for compatible variants
if self.ref_pos <= variant_start {
// now add the variant sequence
let seq = variant.allele1();
self.sequence.extend_from_slice(seq);
// update our ref pos for this haplotype
self.ref_pos = variant_start + variant.ref_len();
true
} else {
// incompatible, so just ignore it; but we need to add the ED as incompatible cost
self.variant_skip_distance += edit_distance(variant.allele0(), variant.allele1());
false
}
}
};
self.alleles.push(allele);
// if provided with more extension power, then do it
if let Some(p) = ref_extension {
self.copy_reference(reference, p)?;
}
Ok(success)
}
/// Copies reference sequence up to `region_end`.
/// # Arguments
/// * `reference` - the full reference sequence
/// * `region_end` - the end position of the region
pub fn copy_reference(&mut self, reference: &[u8], region_end: usize) -> Result<(), HapDWFAError> {
// add any remaining reference sequence
if self.ref_pos < region_end {
// first update the reference sequence to this point
let reference_update = &reference[self.ref_pos..region_end];
self.sequence.extend_from_slice(reference_update);
self.ref_pos = region_end;
}
Ok(())
}
// getters
pub fn ref_pos(&self) -> usize {
self.ref_pos
}
pub fn alleles(&self) -> &[Allele] {
&self.alleles
}
pub fn sequence(&self) -> &[u8] {
&self.sequence
}
pub fn variant_skip_distance(&self) -> usize {
self.variant_skip_distance
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_haplotype_node() {
// simple test just to make sure the extension works at the haplotype level
let reference = b"ACGTACGTACGT";
let region_start = 3;
let region_end = 11;
// add the variant to both truth and query
let variant = Variant::new_snv(0, 6, b"G".to_vec(), b"A".to_vec()).unwrap();
let mut hap_node = HaplotypeDWFA::new(region_start, usize::MAX); // start with first 'T'
hap_node.extend_variant(
reference, true,
&variant, Allele::Alternate, None
).unwrap();
hap_node.extend_variant(
reference, false,
&variant, Allele::Alternate, None
).unwrap();
// add an extra variant that is a FP and generates ED
let variant2 = Variant::new_snv(0, 7, b"T".to_vec(), b"A".to_vec()).unwrap();
hap_node.extend_variant(
reference, false,
&variant2, Allele::Alternate, None
).unwrap();
assert_eq!(hap_node.edit_distance(), 0); // ED won't get generated just yet
// add a reference variant, so no change
let variant3 = Variant::new_insertion(0, 8, b"A".to_vec(), b"TTTTTTTT".to_vec()).unwrap();
hap_node.extend_variant(
reference, false,
&variant3, Allele::Reference, None
).unwrap();
assert_eq!(hap_node.edit_distance(), 0);
// finalize it, cutting off the last 'T'
hap_node.finalize_dwfa(reference, region_end).unwrap();
// check all these values
assert_eq!(hap_node.edit_distance(), 1); // one FP variant that gets detected on finalizing
assert_eq!(hap_node.truth_haplotype().alleles(), &[Allele::Alternate]);
assert_eq!(hap_node.query_haplotype().alleles(), &[Allele::Alternate, Allele::Alternate, Allele::Reference]);
assert_eq!(hap_node.truth_haplotype().sequence(), b"TACATACG"); // G->A
assert_eq!(hap_node.query_haplotype().sequence(), b"TACAAACG"); // G->A and FP T->A
}
#[test]
fn test_haplotype_node_incompatible() {
// simple test just to make sure the extension works at the haplotype level
let reference = b"ACGTACGTACGT";
let region_start = 0;
let region_end = reference.len();
// add the variant to both truth and query
let variant = Variant::new_snv(0, 6, b"G".to_vec(), b"A".to_vec()).unwrap();
let mut hap_node = HaplotypeDWFA::new(region_start, usize::MAX); // start with first 'T'
hap_node.extend_variant(
reference, true,
&variant, Allele::Alternate, None
).unwrap();
hap_node.extend_variant(
reference, false,
&variant, Allele::Alternate, None
).unwrap();
// add an extra variant that is a FP and incompatible
let variant2 = Variant::new_snv(0, 6, b"G".to_vec(), b"T".to_vec()).unwrap();
hap_node.extend_variant(
reference, false,
&variant2, Allele::Alternate, None
).unwrap();
assert_eq!(hap_node.edit_distance(), 0); // ED won't get generated
assert_eq!(hap_node.total_variant_skip_distance(), 1); // but VD does
// finalize it, cutting off the last 'T'
hap_node.finalize_dwfa(reference, region_end).unwrap();
// check all these values
assert_eq!(hap_node.edit_distance(), 0); // no ED
assert_eq!(hap_node.total_variant_skip_distance(), 1); // one VD
assert_eq!(hap_node.total_cost(), 1); // 0 + 1
}
}