lorikeet_genome/read_threading/
abstract_read_threading_graph.rs

1use gkl::smithwaterman::Parameters;
2use hashlink::LinkedHashMap;
3use petgraph::stable_graph::{EdgeIndex, NodeIndex};
4use petgraph::Direction;
5use rust_htslib::bam::record::{Cigar, CigarString};
6use std::fmt::Debug;
7
8use crate::assembly::kmer::Kmer;
9use crate::graphs::base_edge::BaseEdgeStruct;
10use crate::graphs::base_graph::BaseGraph;
11use crate::graphs::multi_sample_edge::MultiSampleEdge;
12use crate::graphs::seq_graph::SeqGraph;
13use crate::read_threading::multi_debruijn_vertex::MultiDeBruijnVertex;
14use crate::read_threading::read_threading_graph::ReadThreadingGraph;
15use crate::reads::bird_tool_reads::BirdToolRead;
16use crate::utils::simple_interval::Locatable;
17
18/**
19 * Read threading graph class intended to contain duplicated code between {@link ReadThreadingGraph} and {@link JunctionTreeLinkedDeBruijnGraph}.
20 */
21
22// TODO: Change this to an enum with two options, ReadThreadGraph & JunctionTreeLinkedDeBruijnGraph
23pub trait AbstractReadThreadingGraph: Sized + Send + Sync + Debug {
24    fn get_base_graph(&self) -> &BaseGraph<MultiDeBruijnVertex, MultiSampleEdge>;
25
26    fn get_base_graph_mut(&mut self) -> &mut BaseGraph<MultiDeBruijnVertex, MultiSampleEdge>;
27
28    fn get_kmer_size(&self) -> usize;
29
30    fn get_reference_source_vertex(&self) -> Option<NodeIndex>;
31
32    fn get_reference_sink_vertex(&self) -> Option<NodeIndex>;
33
34    fn has_cycles(&self) -> bool;
35
36    // heuristic to decide if the graph should be thown away based on low complexity/poor assembly
37    fn is_low_quality_graph(&self) -> bool;
38
39    fn print_graph(&self, file_name: String, prune_factor: usize);
40
41    /**
42     * Checks whether a kmer can be the threading start based on the current threading start location policy.
43     *
44     * @param kmer the query kmer.
45     * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise.
46     * @see #setThreadingStartOnlyAtExistingVertex(boolean)
47     */
48    fn is_threading_start(
49        &self,
50        kmer: &Kmer
51    ) -> bool;
52
53    // get the next kmerVertex for ChainExtension and validate if necessary.
54    fn get_next_kmer_vertex_for_chain_extension(
55        &self,
56        kmer: &Kmer,
57        is_ref: bool,
58        prev_vertex: NodeIndex,
59    ) -> Option<&NodeIndex>;
60
61    /**
62     * Add bases in sequence to this graph
63     *
64     * @param seqName  a useful seqName for this read, for debugging purposes
65     * @param sequence non-null sequence of bases
66     * @param start    the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
67     * @param stop     the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
68     * @param count    the representative count of this sequence (to use as the weight)
69     * @param isRef    is this the reference sequence.
70     */
71    fn add_sequence<'a>(
72        &self,
73        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
74        seq_name: String,
75        sample_index: usize,
76        sequence: &'a [u8],
77        start: usize,
78        stop: usize,
79        count: usize,
80        is_ref: bool,
81    );
82
83    /**
84     * Determine whether the provided cigar is okay to merge into the reference path
85     *
86     * @param cigar                the cigar to analyze
87     * @param requireFirstElementM if true, require that the first cigar element be an M operator in order for it to be okay
88     * @param requireLastElementM  if true, require that the last cigar element be an M operator in order for it to be okay
89     * @return true if it's okay to merge, false otherwise
90     */
91    fn cigar_is_okay_to_merge(
92        cigar: &CigarString,
93        require_first_element_m: bool,
94        require_last_element_m: bool,
95    ) -> bool {
96        let num_elements = cigar.0.len();
97
98        // don't allow more than a couple of different ops
99        if num_elements == 0 || num_elements > ReadThreadingGraph::MAX_CIGAR_COMPLEXITY {
100            return false;
101        };
102
103        // the first element must be an M
104        if require_first_element_m {
105            match cigar.0[0] {
106                Cigar::Match(_) => {
107                    // correct operator but more checks to do
108                }
109                _ => return false,
110            }
111        };
112
113        // the last element must be an M
114        if require_last_element_m {
115            match cigar.0[num_elements - 1] {
116                Cigar::Match(_) => {
117                    // correct operator but more checks to do
118                }
119                _ => return false,
120            }
121        };
122
123        // note that there are checks for too many mismatches in the dangling branch later in the process
124        return true;
125    }
126
127    /**
128     * Changes the threading start location policy.
129     *
130     * @param value {@code true} if threading will start only at existing vertices in the graph, {@code false} if
131     *              it can start at any unique kmer.
132     */
133    fn set_threading_start_only_at_existing_vertex(&mut self, value: bool);
134
135    /**
136     * Add a read to the sequence graph.  Finds maximal consecutive runs of bases with sufficient quality
137     * and applies {@see addSequence} to these subreads if they are longer than the kmer size.
138     *
139     * @param read a non-null read
140     */
141    fn add_read<'a>(
142        &mut self,
143        read: &'a BirdToolRead,
144        sample_names: &[String],
145        count: &mut usize,
146        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
147    );
148
149    // only add the new kmer to the map if it exists and isn't in our non-unique kmer list
150    fn track_kmer(&mut self, kmer: Kmer, new_vertex: NodeIndex);
151
152    /**
153     * Determines whether a base can safely be used for assembly.
154     * Currently disallows Ns and/or those with low quality
155     *
156     * @param base  the base under consideration
157     * @param qual  the quality of that base
158     * @return true if the base can be used for assembly, false otherwise
159     */
160    fn base_is_usable_for_assembly(&self, base: u8, qual: u8) -> bool;
161
162    /**
163     * Since we want to duplicate non-unique kmers in the graph code we must determine what those kmers are
164     */
165    fn preprocess_reads<'a>(&mut self, pending: &LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>);
166
167    /**
168     * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have
169     * been added to the graph.
170     */
171    fn build_graph_if_necessary<'a>(
172        &mut self,
173        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
174    );
175
176    /**
177     * Thread sequence seqForKmers through the current graph, updating the graph as appropriate
178     *
179     * @param seqForKmers a non-null sequence
180     */
181    fn thread_sequence<'a>(&mut self, seq_for_kmers: &SequenceForKmers<'a>);
182
183    /**
184     * Find vertex and its position in seqForKmers where we should start assembling seqForKmers
185     *
186     * @param seqForKmers the sequence we want to thread into the graph
187     * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one
188     */
189    fn find_start<'a>(&self, seq_for_kmers: &SequenceForKmers<'a>) -> Option<usize>;
190
191    // whether reads are needed after graph construction
192    fn should_remove_reads_after_graph_construction(&self) -> bool;
193
194    /**
195     * calculates the longest suffix match between a sequence and a smaller kmer
196     *
197     * @param seq      the (reference) sequence
198     * @param kmer     the smaller kmer sequence
199     * @param seqStart the index (inclusive) on seq to start looking backwards from
200     * @return the longest matching suffix
201     */
202    fn longest_suffix_match(seq: &[u8], kmer: &[u8], seq_start: i64) -> usize {
203        for len in 1..=(kmer.len() as i64) {
204            let seq_i = seq_start - len + 1;
205            let kmer_i = kmer.len() as i64 - len;
206            if seq_i < 0 || seq[seq_i as usize] != kmer[kmer_i as usize] {
207                return len as usize - 1;
208            }
209        }
210
211        return kmer.len();
212    }
213
214    /**
215     * Get the vertex for the kmer in sequence starting at start
216     *
217     * @param sequence the sequence
218     * @param start    the position of the kmer start
219     * @return a non-null vertex
220     */
221    fn get_or_create_kmer_vertex(&mut self, sequence: &[u8], start: usize) -> NodeIndex;
222
223    /**
224     * Try to recover dangling tails
225     *
226     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
227     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
228     * @param recoverAll              recover even branches with forks
229     * @param aligner
230     */
231    fn recover_dangling_tails(
232        &mut self,
233        prune_factor: usize,
234        min_dangling_branch_length: i32,
235        recover_all: bool,
236        dangling_end_sw_parameters: &Parameters,
237    );
238
239    /**
240     * Try to recover dangling heads
241     *
242     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
243     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
244     * @param recoverAll              recover even branches with forks
245     * @param aligner
246     */
247    fn recover_dangling_heads(
248        &mut self,
249        prune_factor: usize,
250        min_dangling_branch_length: i32,
251        recover_all: bool,
252        dangling_end_sw_parameters: &Parameters,
253    );
254
255    /**
256     * Attempt to attach vertex with out-degree == 0 to the graph
257     *
258     * @param vertex                  the vertex to recover
259     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
260     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
261     * @param aligner
262     * @return 1 if we successfully recovered the vertex and 0 otherwise
263     */
264    fn recover_dangling_tail(
265        &mut self,
266        vertex: NodeIndex,
267        prune_factor: usize,
268        min_dangling_branch_length: i32,
269        recover_all: bool,
270        dangling_tail_sw_parameters: &Parameters,
271    ) -> usize;
272
273    /**
274     * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph
275     *
276     * @param vertex                  the vertex to recover
277     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
278     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
279     * @param recoverAll              recover even branches with forks
280     * @param aligner
281     * @return 1 if we successfully recovered a vertex and 0 otherwise
282     */
283    fn recover_dangling_head(
284        &mut self,
285        vertex: NodeIndex,
286        prune_factor: usize,
287        min_dangling_branch_length: i32,
288        recover_all: bool,
289        dangling_head_sw_parameters: &Parameters,
290    ) -> usize;
291
292    /**
293     * Actually merge the dangling tail if possible
294     *
295     * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
296     * @return 1 if merge was successful, 0 otherwise
297     */
298    fn merge_dangling_tail(
299        &mut self,
300        dangling_tail_merge_result: DanglingChainMergeHelper,
301    ) -> usize;
302
303    /**
304     * Actually merge the dangling head if possible, this is the old codepath that does not handle indels
305     *
306     * @param danglingHeadMergeResult   the result from generating a Cigar for the dangling head against the reference
307     * @return 1 if merge was successful, 0 otherwise
308     */
309    fn merge_dangling_head_legacy(
310        &mut self,
311        dangling_head_merge_result: DanglingChainMergeHelper,
312    ) -> usize;
313
314    /**
315     * Actually merge the dangling head if possible
316     *
317     * @param danglingHeadMergeResult   the result from generating a Cigar for the dangling head against the reference
318     * @return 1 if merge was successful, 0 otherwise
319     */
320    fn merge_dangling_head(
321        &mut self,
322        dangling_head_merge_result: DanglingChainMergeHelper,
323    ) -> usize;
324
325    /**
326     * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
327     * Requires that at a minimum there are at least #getMinMatchingBases() matches between the reference and the read
328     * at the end in order to emit an alignment offset.
329     *
330     * @param cigarElements cigar elements corresponding to the alignment between path1 and path2
331     * @param path1  the first path
332     * @param path2  the second path
333     * @return an integer pair object where the key is the offset into path1 and the value is offset into path2 (both -1 if no path is found)
334     */
335    fn best_prefix_match(
336        &self,
337        cigar_elements: &Vec<Cigar>,
338        path1: &[u8],
339        path2: &[u8],
340    ) -> (i64, i64);
341
342    /**
343     * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
344     * Assumes that path1.length >= maxIndex and path2.length >= maxIndex.
345     *
346     * @param path1    the first path
347     * @param path2    the second path
348     * @param maxIndex the maximum index to traverse (not inclusive)
349     * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex
350     */
351    fn best_prefix_match_legacy(
352        &self,
353        path1: &[u8],
354        path2: &[u8],
355        max_index: usize,
356    ) -> Option<usize>;
357
358    /**
359     * NOTE: this method is only used for dangling heads and not tails.
360     *
361     * Determine the maximum number of mismatches permitted on the branch.
362     * Unless it's preset (e.g. by unit tests) it should be the length of the branch divided by the kmer size.
363     *
364     * @param lengthOfDanglingBranch the length of the branch itself
365     * @return positive integer
366     */
367    fn get_max_mismatches_legacy(&self, length_of_dangling_branch: usize) -> usize;
368
369    /**
370     * The minimum number of matches to be considered allowable for recovering dangling ends
371     */
372    fn get_min_matching_bases(&self) -> i32;
373
374    /**
375     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
376     * provided vertex is the sink) and the reference path.
377     *
378     * @param aligner
379     * @param vertex      the sink of the dangling chain
380     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
381     * @param recoverAll  recover even branches with forks
382     * @return a SmithWaterman object which can be null if no proper alignment could be generated
383     */
384    fn generate_cigar_against_downwards_reference_path(
385        &self,
386        vertex: NodeIndex,
387        prune_factor: usize,
388        min_dangling_branch_length: i32,
389        recover_all: bool,
390        dangling_tail_sw_parameters: &Parameters,
391    ) -> Option<DanglingChainMergeHelper>;
392
393    /**
394     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
395     * provided vertex is the source) and the reference path.
396     *
397     * @param aligner
398     * @param vertex      the source of the dangling head
399     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
400     * @param recoverAll  recover even branches with forks
401     * @return a SmithWaterman object which can be null if no proper alignment could be generated
402     */
403    fn generate_cigar_against_upwards_reference_path(
404        &self,
405        vertex: NodeIndex,
406        prune_factor: usize,
407        min_dangling_branch_length: i32,
408        recover_all: bool,
409        dangling_head_sw_parameters: &Parameters,
410    ) -> Option<DanglingChainMergeHelper>;
411
412    /**
413     * Finds the path upwards in the graph from this vertex to the first diverging node, including that (lowest common ancestor) vertex.
414     * Note that nodes are excluded if their pruning weight is less than the pruning factor.
415     *
416     * @param vertex         the original vertex
417     * @param pruneFactor    the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
418     * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
419     * @return the path if it can be determined or null if this vertex either doesn't merge onto another path or
420     * has an ancestor with multiple incoming edges before hitting the reference path
421     */
422    fn find_path_upwards_to_lowest_common_ancestor(
423        &self,
424        vertex: NodeIndex,
425        prune_factor: usize,
426        give_up_at_branch: bool,
427    ) -> Option<Vec<NodeIndex>>;
428
429    /**
430     * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex.
431     * However note that the path is reversed so that this vertex ends up at the end of the path.
432     * Also note that nodes are excluded if their pruning weight is less than the pruning factor.
433     *
434     * @param vertex         the original vertex
435     * @param pruneFactor    the prune factor to use in ignoring chain pieces
436     * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
437     * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
438     * has a descendant with multiple outgoing edges before hitting the reference path
439     */
440    fn find_path_downwards_to_highest_common_desendant_of_reference(
441        &self,
442        vertex: NodeIndex,
443        prune_factor: usize,
444        give_up_at_branch: bool,
445    ) -> Option<Vec<NodeIndex>>;
446
447    /**
448     * Finds the path in the graph from this vertex to the reference sink, including this vertex
449     *
450     * @param start           the reference vertex to start from
451     * @param direction       describes which direction to move in the graph (i.e. down to the reference sink or up to the source)
452     * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths
453     * @return the path (non-null, non-empty)
454     */
455    fn get_reference_path(
456        &self,
457        start: NodeIndex,
458        direction: TraversalDirection,
459        blacklisted_edge: Option<EdgeIndex>,
460    ) -> Vec<NodeIndex>;
461
462    /**
463     * Get the unique vertex for kmer, or null if not possible.
464     *
465     * @param allowRefSource if true, we will allow kmer to match the reference source vertex
466     * @return a vertex for kmer, or null (either because it doesn't exist or is non-unique for graphs that have such a distinction)
467     */
468    fn get_kmer_vertex(&self, kmer: &Kmer, allow_ref_source: bool) -> Option<&NodeIndex>;
469
470    /**
471     * Create a new vertex for kmer.  Add it to the kmerToVertexMap map if appropriate.
472     *
473     * @param kmer the kmer we want to create a vertex for
474     * @return the non-null created vertex
475     */
476    fn create_vertex(&mut self, sequence: &[u8], kmer: Kmer) -> NodeIndex;
477
478    fn increase_counts_in_matched_kmers(
479        &mut self,
480        seqs_for_kmers: &SequenceForKmers,
481        vertex: NodeIndex,
482        original_kmer: &[u8],
483        offset: Option<usize>,
484    );
485
486    /**
487     * Workhorse routine of the assembler.  Given a sequence whose last vertex is anchored in the graph, extend
488     * the graph one bp according to the bases in sequence.
489     *
490     * @param prevVertex a non-null vertex where sequence was last anchored in the graph
491     * @param sequence   the sequence we're threading through the graph
492     * @param kmerStart  the start of the current kmer in graph we'd like to add
493     * @param count      the number of observations of this kmer in graph (can be > 1 for GGA)
494     * @param isRef      is this the reference sequence?
495     * @return a non-null vertex connecting prevVertex to in the graph based on sequence
496     */
497    fn extend_chain_by_one(
498        &mut self,
499        prev_vertex: NodeIndex,
500        sequence: &[u8],
501        kmer_start: usize,
502        count: usize,
503        is_ref: bool,
504    ) -> NodeIndex;
505
506    /**
507     * Finds a path starting from a given vertex and satisfying various predicates
508     *
509     * @param vertex      the original vertex
510     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
511     * @param done        test for whether a vertex is at the end of the path
512     * @param returnPath  test for whether to return a found path based on its terminal vertex
513     * @param nextEdge    function on vertices returning the next edge in the path
514     * @param nextNode    function of edges returning the next vertex in the path
515     * @return a path, if one satisfying all predicates is found, {@code null} otherwise
516     */
517    fn find_path(
518        &self,
519        vertex: NodeIndex,
520        prune_factor: usize,
521        done: &dyn Fn(NodeIndex) -> bool,
522        return_path: &dyn Fn(NodeIndex) -> bool,
523        next_edge: &dyn Fn(NodeIndex) -> EdgeIndex,
524        next_node: &dyn Fn(EdgeIndex) -> NodeIndex,
525    ) -> Option<Vec<NodeIndex>>;
526
527    fn has_incident_ref_edge(&self, v: NodeIndex) -> bool;
528
529    fn get_heaviest_edge(&self, v: NodeIndex, direction: Direction) -> EdgeIndex;
530
531    /**
532     * The base sequence for the given path.
533     *
534     * @param path         the list of vertexes that make up the path
535     * @param expandSource if true and if we encounter a source node, then expand (and reverse) the character sequence for that node
536     * @return non-null sequence of bases corresponding to the given path
537     */
538    fn get_bases_for_path(&self, path: &Vec<NodeIndex>, expand_source: bool) -> String;
539
540    fn extend_dangling_path_against_reference(
541        &mut self,
542        dangling_head_merge_result: &mut DanglingChainMergeHelper,
543        num_nodes_to_extend: usize,
544    ) -> bool;
545
546    fn remove_paths_not_connected_to_ref(&mut self);
547
548    fn to_sequence_graph(&self) -> SeqGraph<BaseEdgeStruct>;
549
550    // Method that will be called immediately before haplotype finding in the event there are
551    // alteations that must be made to the graph based on implementation
552    fn post_process_for_haplotype_finding<L: Locatable>(
553        &mut self,
554        debug_graph_output_path: Option<&String>,
555        ref_haplotype: &L,
556    );
557
558    fn find_kmer(&self, kmer: &Kmer) -> Option<&NodeIndex>;
559
560    fn set_increase_counts_through_branches(&mut self, increase_counts_through_branches: bool);
561
562    fn set_min_matching_bases_to_dangling_end_recovery(&mut self, min_matching_bases: i32);
563}
564
565pub enum TraversalDirection {
566    Downwards,
567    Upwards,
568}
569
570/**
571 * Keeps track of the information needed to add a sequence to the read threading assembly graph
572 */
573#[derive(Debug, Clone)]
574pub struct SequenceForKmers<'a> {
575    pub name: String,
576    pub sequence: &'a [u8],
577    pub start: usize,
578    pub stop: usize,
579    pub count: usize,
580    pub is_ref: bool,
581}
582
583impl<'a> SequenceForKmers<'a> {
584    /**
585     * Create a new sequence for creating kmers
586     */
587    pub fn new(
588        name: String,
589        sequence: &'a [u8],
590        start: usize,
591        stop: usize,
592        count: usize,
593        is_ref: bool,
594    ) -> SequenceForKmers {
595        SequenceForKmers {
596            name,
597            sequence,
598            start,
599            stop,
600            count,
601            is_ref,
602        }
603    }
604}
605
606/**
607 * Class to keep track of the important dangling chain merging data
608 */
609pub struct DanglingChainMergeHelper {
610    pub dangling_path: Vec<NodeIndex>,
611    pub reference_path: Vec<NodeIndex>,
612    pub dangling_path_string: String,
613    pub reference_path_string: String,
614    pub cigar: CigarString,
615}
616
617impl DanglingChainMergeHelper {
618    pub fn new(
619        dangling_path: Vec<NodeIndex>,
620        reference_path: Vec<NodeIndex>,
621        dangling_path_string: String,
622        reference_path_string: String,
623        cigar: CigarString,
624    ) -> DanglingChainMergeHelper {
625        DanglingChainMergeHelper {
626            dangling_path,
627            reference_path,
628            dangling_path_string,
629            reference_path_string,
630            cigar,
631        }
632    }
633}
lorikeet_genome/read_threading/abstract_read_threading_graph.rs

lorikeet_genome/read_threading/
abstract_read_threading_graph.rs