lorikeet_genome/read_threading/abstract_read_threading_graph.rs
1use gkl::smithwaterman::Parameters;
2use hashlink::LinkedHashMap;
3use petgraph::stable_graph::{EdgeIndex, NodeIndex};
4use petgraph::Direction;
5use rust_htslib::bam::record::{Cigar, CigarString};
6use std::fmt::Debug;
7
8use crate::assembly::kmer::Kmer;
9use crate::graphs::base_edge::BaseEdgeStruct;
10use crate::graphs::base_graph::BaseGraph;
11use crate::graphs::multi_sample_edge::MultiSampleEdge;
12use crate::graphs::seq_graph::SeqGraph;
13use crate::read_threading::multi_debruijn_vertex::MultiDeBruijnVertex;
14use crate::read_threading::read_threading_graph::ReadThreadingGraph;
15use crate::reads::bird_tool_reads::BirdToolRead;
16use crate::utils::simple_interval::Locatable;
17
18/**
19 * Read threading graph class intended to contain duplicated code between {@link ReadThreadingGraph} and {@link JunctionTreeLinkedDeBruijnGraph}.
20 */
21
22// TODO: Change this to an enum with two options, ReadThreadGraph & JunctionTreeLinkedDeBruijnGraph
23pub trait AbstractReadThreadingGraph: Sized + Send + Sync + Debug {
24 fn get_base_graph(&self) -> &BaseGraph<MultiDeBruijnVertex, MultiSampleEdge>;
25
26 fn get_base_graph_mut(&mut self) -> &mut BaseGraph<MultiDeBruijnVertex, MultiSampleEdge>;
27
28 fn get_kmer_size(&self) -> usize;
29
30 fn get_reference_source_vertex(&self) -> Option<NodeIndex>;
31
32 fn get_reference_sink_vertex(&self) -> Option<NodeIndex>;
33
34 fn has_cycles(&self) -> bool;
35
36 // heuristic to decide if the graph should be thown away based on low complexity/poor assembly
37 fn is_low_quality_graph(&self) -> bool;
38
39 fn print_graph(&self, file_name: String, prune_factor: usize);
40
41 /**
42 * Checks whether a kmer can be the threading start based on the current threading start location policy.
43 *
44 * @param kmer the query kmer.
45 * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise.
46 * @see #setThreadingStartOnlyAtExistingVertex(boolean)
47 */
48 fn is_threading_start(
49 &self,
50 kmer: &Kmer
51 ) -> bool;
52
53 // get the next kmerVertex for ChainExtension and validate if necessary.
54 fn get_next_kmer_vertex_for_chain_extension(
55 &self,
56 kmer: &Kmer,
57 is_ref: bool,
58 prev_vertex: NodeIndex,
59 ) -> Option<&NodeIndex>;
60
61 /**
62 * Add bases in sequence to this graph
63 *
64 * @param seqName a useful seqName for this read, for debugging purposes
65 * @param sequence non-null sequence of bases
66 * @param start the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
67 * @param stop the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
68 * @param count the representative count of this sequence (to use as the weight)
69 * @param isRef is this the reference sequence.
70 */
71 fn add_sequence<'a>(
72 &self,
73 pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
74 seq_name: String,
75 sample_index: usize,
76 sequence: &'a [u8],
77 start: usize,
78 stop: usize,
79 count: usize,
80 is_ref: bool,
81 );
82
83 /**
84 * Determine whether the provided cigar is okay to merge into the reference path
85 *
86 * @param cigar the cigar to analyze
87 * @param requireFirstElementM if true, require that the first cigar element be an M operator in order for it to be okay
88 * @param requireLastElementM if true, require that the last cigar element be an M operator in order for it to be okay
89 * @return true if it's okay to merge, false otherwise
90 */
91 fn cigar_is_okay_to_merge(
92 cigar: &CigarString,
93 require_first_element_m: bool,
94 require_last_element_m: bool,
95 ) -> bool {
96 let num_elements = cigar.0.len();
97
98 // don't allow more than a couple of different ops
99 if num_elements == 0 || num_elements > ReadThreadingGraph::MAX_CIGAR_COMPLEXITY {
100 return false;
101 };
102
103 // the first element must be an M
104 if require_first_element_m {
105 match cigar.0[0] {
106 Cigar::Match(_) => {
107 // correct operator but more checks to do
108 }
109 _ => return false,
110 }
111 };
112
113 // the last element must be an M
114 if require_last_element_m {
115 match cigar.0[num_elements - 1] {
116 Cigar::Match(_) => {
117 // correct operator but more checks to do
118 }
119 _ => return false,
120 }
121 };
122
123 // note that there are checks for too many mismatches in the dangling branch later in the process
124 return true;
125 }
126
127 /**
128 * Changes the threading start location policy.
129 *
130 * @param value {@code true} if threading will start only at existing vertices in the graph, {@code false} if
131 * it can start at any unique kmer.
132 */
133 fn set_threading_start_only_at_existing_vertex(&mut self, value: bool);
134
135 /**
136 * Add a read to the sequence graph. Finds maximal consecutive runs of bases with sufficient quality
137 * and applies {@see addSequence} to these subreads if they are longer than the kmer size.
138 *
139 * @param read a non-null read
140 */
141 fn add_read<'a>(
142 &mut self,
143 read: &'a BirdToolRead,
144 sample_names: &[String],
145 count: &mut usize,
146 pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
147 );
148
149 // only add the new kmer to the map if it exists and isn't in our non-unique kmer list
150 fn track_kmer(&mut self, kmer: Kmer, new_vertex: NodeIndex);
151
152 /**
153 * Determines whether a base can safely be used for assembly.
154 * Currently disallows Ns and/or those with low quality
155 *
156 * @param base the base under consideration
157 * @param qual the quality of that base
158 * @return true if the base can be used for assembly, false otherwise
159 */
160 fn base_is_usable_for_assembly(&self, base: u8, qual: u8) -> bool;
161
162 /**
163 * Since we want to duplicate non-unique kmers in the graph code we must determine what those kmers are
164 */
165 fn preprocess_reads<'a>(&mut self, pending: &LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>);
166
167 /**
168 * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have
169 * been added to the graph.
170 */
171 fn build_graph_if_necessary<'a>(
172 &mut self,
173 pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
174 );
175
176 /**
177 * Thread sequence seqForKmers through the current graph, updating the graph as appropriate
178 *
179 * @param seqForKmers a non-null sequence
180 */
181 fn thread_sequence<'a>(&mut self, seq_for_kmers: &SequenceForKmers<'a>);
182
183 /**
184 * Find vertex and its position in seqForKmers where we should start assembling seqForKmers
185 *
186 * @param seqForKmers the sequence we want to thread into the graph
187 * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one
188 */
189 fn find_start<'a>(&self, seq_for_kmers: &SequenceForKmers<'a>) -> Option<usize>;
190
191 // whether reads are needed after graph construction
192 fn should_remove_reads_after_graph_construction(&self) -> bool;
193
194 /**
195 * calculates the longest suffix match between a sequence and a smaller kmer
196 *
197 * @param seq the (reference) sequence
198 * @param kmer the smaller kmer sequence
199 * @param seqStart the index (inclusive) on seq to start looking backwards from
200 * @return the longest matching suffix
201 */
202 fn longest_suffix_match(seq: &[u8], kmer: &[u8], seq_start: i64) -> usize {
203 for len in 1..=(kmer.len() as i64) {
204 let seq_i = seq_start - len + 1;
205 let kmer_i = kmer.len() as i64 - len;
206 if seq_i < 0 || seq[seq_i as usize] != kmer[kmer_i as usize] {
207 return len as usize - 1;
208 }
209 }
210
211 return kmer.len();
212 }
213
214 /**
215 * Get the vertex for the kmer in sequence starting at start
216 *
217 * @param sequence the sequence
218 * @param start the position of the kmer start
219 * @return a non-null vertex
220 */
221 fn get_or_create_kmer_vertex(&mut self, sequence: &[u8], start: usize) -> NodeIndex;
222
223 /**
224 * Try to recover dangling tails
225 *
226 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
227 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
228 * @param recoverAll recover even branches with forks
229 * @param aligner
230 */
231 fn recover_dangling_tails(
232 &mut self,
233 prune_factor: usize,
234 min_dangling_branch_length: i32,
235 recover_all: bool,
236 dangling_end_sw_parameters: &Parameters,
237 );
238
239 /**
240 * Try to recover dangling heads
241 *
242 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
243 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
244 * @param recoverAll recover even branches with forks
245 * @param aligner
246 */
247 fn recover_dangling_heads(
248 &mut self,
249 prune_factor: usize,
250 min_dangling_branch_length: i32,
251 recover_all: bool,
252 dangling_end_sw_parameters: &Parameters,
253 );
254
255 /**
256 * Attempt to attach vertex with out-degree == 0 to the graph
257 *
258 * @param vertex the vertex to recover
259 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
260 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
261 * @param aligner
262 * @return 1 if we successfully recovered the vertex and 0 otherwise
263 */
264 fn recover_dangling_tail(
265 &mut self,
266 vertex: NodeIndex,
267 prune_factor: usize,
268 min_dangling_branch_length: i32,
269 recover_all: bool,
270 dangling_tail_sw_parameters: &Parameters,
271 ) -> usize;
272
273 /**
274 * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph
275 *
276 * @param vertex the vertex to recover
277 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
278 * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
279 * @param recoverAll recover even branches with forks
280 * @param aligner
281 * @return 1 if we successfully recovered a vertex and 0 otherwise
282 */
283 fn recover_dangling_head(
284 &mut self,
285 vertex: NodeIndex,
286 prune_factor: usize,
287 min_dangling_branch_length: i32,
288 recover_all: bool,
289 dangling_head_sw_parameters: &Parameters,
290 ) -> usize;
291
292 /**
293 * Actually merge the dangling tail if possible
294 *
295 * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
296 * @return 1 if merge was successful, 0 otherwise
297 */
298 fn merge_dangling_tail(
299 &mut self,
300 dangling_tail_merge_result: DanglingChainMergeHelper,
301 ) -> usize;
302
303 /**
304 * Actually merge the dangling head if possible, this is the old codepath that does not handle indels
305 *
306 * @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
307 * @return 1 if merge was successful, 0 otherwise
308 */
309 fn merge_dangling_head_legacy(
310 &mut self,
311 dangling_head_merge_result: DanglingChainMergeHelper,
312 ) -> usize;
313
314 /**
315 * Actually merge the dangling head if possible
316 *
317 * @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
318 * @return 1 if merge was successful, 0 otherwise
319 */
320 fn merge_dangling_head(
321 &mut self,
322 dangling_head_merge_result: DanglingChainMergeHelper,
323 ) -> usize;
324
325 /**
326 * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
327 * Requires that at a minimum there are at least #getMinMatchingBases() matches between the reference and the read
328 * at the end in order to emit an alignment offset.
329 *
330 * @param cigarElements cigar elements corresponding to the alignment between path1 and path2
331 * @param path1 the first path
332 * @param path2 the second path
333 * @return an integer pair object where the key is the offset into path1 and the value is offset into path2 (both -1 if no path is found)
334 */
335 fn best_prefix_match(
336 &self,
337 cigar_elements: &Vec<Cigar>,
338 path1: &[u8],
339 path2: &[u8],
340 ) -> (i64, i64);
341
342 /**
343 * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
344 * Assumes that path1.length >= maxIndex and path2.length >= maxIndex.
345 *
346 * @param path1 the first path
347 * @param path2 the second path
348 * @param maxIndex the maximum index to traverse (not inclusive)
349 * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex
350 */
351 fn best_prefix_match_legacy(
352 &self,
353 path1: &[u8],
354 path2: &[u8],
355 max_index: usize,
356 ) -> Option<usize>;
357
358 /**
359 * NOTE: this method is only used for dangling heads and not tails.
360 *
361 * Determine the maximum number of mismatches permitted on the branch.
362 * Unless it's preset (e.g. by unit tests) it should be the length of the branch divided by the kmer size.
363 *
364 * @param lengthOfDanglingBranch the length of the branch itself
365 * @return positive integer
366 */
367 fn get_max_mismatches_legacy(&self, length_of_dangling_branch: usize) -> usize;
368
369 /**
370 * The minimum number of matches to be considered allowable for recovering dangling ends
371 */
372 fn get_min_matching_bases(&self) -> i32;
373
374 /**
375 * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
376 * provided vertex is the sink) and the reference path.
377 *
378 * @param aligner
379 * @param vertex the sink of the dangling chain
380 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
381 * @param recoverAll recover even branches with forks
382 * @return a SmithWaterman object which can be null if no proper alignment could be generated
383 */
384 fn generate_cigar_against_downwards_reference_path(
385 &self,
386 vertex: NodeIndex,
387 prune_factor: usize,
388 min_dangling_branch_length: i32,
389 recover_all: bool,
390 dangling_tail_sw_parameters: &Parameters,
391 ) -> Option<DanglingChainMergeHelper>;
392
393 /**
394 * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
395 * provided vertex is the source) and the reference path.
396 *
397 * @param aligner
398 * @param vertex the source of the dangling head
399 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
400 * @param recoverAll recover even branches with forks
401 * @return a SmithWaterman object which can be null if no proper alignment could be generated
402 */
403 fn generate_cigar_against_upwards_reference_path(
404 &self,
405 vertex: NodeIndex,
406 prune_factor: usize,
407 min_dangling_branch_length: i32,
408 recover_all: bool,
409 dangling_head_sw_parameters: &Parameters,
410 ) -> Option<DanglingChainMergeHelper>;
411
412 /**
413 * Finds the path upwards in the graph from this vertex to the first diverging node, including that (lowest common ancestor) vertex.
414 * Note that nodes are excluded if their pruning weight is less than the pruning factor.
415 *
416 * @param vertex the original vertex
417 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
418 * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
419 * @return the path if it can be determined or null if this vertex either doesn't merge onto another path or
420 * has an ancestor with multiple incoming edges before hitting the reference path
421 */
422 fn find_path_upwards_to_lowest_common_ancestor(
423 &self,
424 vertex: NodeIndex,
425 prune_factor: usize,
426 give_up_at_branch: bool,
427 ) -> Option<Vec<NodeIndex>>;
428
429 /**
430 * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex.
431 * However note that the path is reversed so that this vertex ends up at the end of the path.
432 * Also note that nodes are excluded if their pruning weight is less than the pruning factor.
433 *
434 * @param vertex the original vertex
435 * @param pruneFactor the prune factor to use in ignoring chain pieces
436 * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
437 * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
438 * has a descendant with multiple outgoing edges before hitting the reference path
439 */
440 fn find_path_downwards_to_highest_common_desendant_of_reference(
441 &self,
442 vertex: NodeIndex,
443 prune_factor: usize,
444 give_up_at_branch: bool,
445 ) -> Option<Vec<NodeIndex>>;
446
447 /**
448 * Finds the path in the graph from this vertex to the reference sink, including this vertex
449 *
450 * @param start the reference vertex to start from
451 * @param direction describes which direction to move in the graph (i.e. down to the reference sink or up to the source)
452 * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths
453 * @return the path (non-null, non-empty)
454 */
455 fn get_reference_path(
456 &self,
457 start: NodeIndex,
458 direction: TraversalDirection,
459 blacklisted_edge: Option<EdgeIndex>,
460 ) -> Vec<NodeIndex>;
461
462 /**
463 * Get the unique vertex for kmer, or null if not possible.
464 *
465 * @param allowRefSource if true, we will allow kmer to match the reference source vertex
466 * @return a vertex for kmer, or null (either because it doesn't exist or is non-unique for graphs that have such a distinction)
467 */
468 fn get_kmer_vertex(&self, kmer: &Kmer, allow_ref_source: bool) -> Option<&NodeIndex>;
469
470 /**
471 * Create a new vertex for kmer. Add it to the kmerToVertexMap map if appropriate.
472 *
473 * @param kmer the kmer we want to create a vertex for
474 * @return the non-null created vertex
475 */
476 fn create_vertex(&mut self, sequence: &[u8], kmer: Kmer) -> NodeIndex;
477
478 fn increase_counts_in_matched_kmers(
479 &mut self,
480 seqs_for_kmers: &SequenceForKmers,
481 vertex: NodeIndex,
482 original_kmer: &[u8],
483 offset: Option<usize>,
484 );
485
486 /**
487 * Workhorse routine of the assembler. Given a sequence whose last vertex is anchored in the graph, extend
488 * the graph one bp according to the bases in sequence.
489 *
490 * @param prevVertex a non-null vertex where sequence was last anchored in the graph
491 * @param sequence the sequence we're threading through the graph
492 * @param kmerStart the start of the current kmer in graph we'd like to add
493 * @param count the number of observations of this kmer in graph (can be > 1 for GGA)
494 * @param isRef is this the reference sequence?
495 * @return a non-null vertex connecting prevVertex to in the graph based on sequence
496 */
497 fn extend_chain_by_one(
498 &mut self,
499 prev_vertex: NodeIndex,
500 sequence: &[u8],
501 kmer_start: usize,
502 count: usize,
503 is_ref: bool,
504 ) -> NodeIndex;
505
506 /**
507 * Finds a path starting from a given vertex and satisfying various predicates
508 *
509 * @param vertex the original vertex
510 * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
511 * @param done test for whether a vertex is at the end of the path
512 * @param returnPath test for whether to return a found path based on its terminal vertex
513 * @param nextEdge function on vertices returning the next edge in the path
514 * @param nextNode function of edges returning the next vertex in the path
515 * @return a path, if one satisfying all predicates is found, {@code null} otherwise
516 */
517 fn find_path(
518 &self,
519 vertex: NodeIndex,
520 prune_factor: usize,
521 done: &dyn Fn(NodeIndex) -> bool,
522 return_path: &dyn Fn(NodeIndex) -> bool,
523 next_edge: &dyn Fn(NodeIndex) -> EdgeIndex,
524 next_node: &dyn Fn(EdgeIndex) -> NodeIndex,
525 ) -> Option<Vec<NodeIndex>>;
526
527 fn has_incident_ref_edge(&self, v: NodeIndex) -> bool;
528
529 fn get_heaviest_edge(&self, v: NodeIndex, direction: Direction) -> EdgeIndex;
530
531 /**
532 * The base sequence for the given path.
533 *
534 * @param path the list of vertexes that make up the path
535 * @param expandSource if true and if we encounter a source node, then expand (and reverse) the character sequence for that node
536 * @return non-null sequence of bases corresponding to the given path
537 */
538 fn get_bases_for_path(&self, path: &Vec<NodeIndex>, expand_source: bool) -> String;
539
540 fn extend_dangling_path_against_reference(
541 &mut self,
542 dangling_head_merge_result: &mut DanglingChainMergeHelper,
543 num_nodes_to_extend: usize,
544 ) -> bool;
545
546 fn remove_paths_not_connected_to_ref(&mut self);
547
548 fn to_sequence_graph(&self) -> SeqGraph<BaseEdgeStruct>;
549
550 // Method that will be called immediately before haplotype finding in the event there are
551 // alteations that must be made to the graph based on implementation
552 fn post_process_for_haplotype_finding<L: Locatable>(
553 &mut self,
554 debug_graph_output_path: Option<&String>,
555 ref_haplotype: &L,
556 );
557
558 fn find_kmer(&self, kmer: &Kmer) -> Option<&NodeIndex>;
559
560 fn set_increase_counts_through_branches(&mut self, increase_counts_through_branches: bool);
561
562 fn set_min_matching_bases_to_dangling_end_recovery(&mut self, min_matching_bases: i32);
563}
564
565pub enum TraversalDirection {
566 Downwards,
567 Upwards,
568}
569
570/**
571 * Keeps track of the information needed to add a sequence to the read threading assembly graph
572 */
573#[derive(Debug, Clone)]
574pub struct SequenceForKmers<'a> {
575 pub name: String,
576 pub sequence: &'a [u8],
577 pub start: usize,
578 pub stop: usize,
579 pub count: usize,
580 pub is_ref: bool,
581}
582
583impl<'a> SequenceForKmers<'a> {
584 /**
585 * Create a new sequence for creating kmers
586 */
587 pub fn new(
588 name: String,
589 sequence: &'a [u8],
590 start: usize,
591 stop: usize,
592 count: usize,
593 is_ref: bool,
594 ) -> SequenceForKmers {
595 SequenceForKmers {
596 name,
597 sequence,
598 start,
599 stop,
600 count,
601 is_ref,
602 }
603 }
604}
605
606/**
607 * Class to keep track of the important dangling chain merging data
608 */
609pub struct DanglingChainMergeHelper {
610 pub dangling_path: Vec<NodeIndex>,
611 pub reference_path: Vec<NodeIndex>,
612 pub dangling_path_string: String,
613 pub reference_path_string: String,
614 pub cigar: CigarString,
615}
616
617impl DanglingChainMergeHelper {
618 pub fn new(
619 dangling_path: Vec<NodeIndex>,
620 reference_path: Vec<NodeIndex>,
621 dangling_path_string: String,
622 reference_path_string: String,
623 cigar: CigarString,
624 ) -> DanglingChainMergeHelper {
625 DanglingChainMergeHelper {
626 dangling_path,
627 reference_path,
628 dangling_path_string,
629 reference_path_string,
630 cigar,
631 }
632 }
633}