lorikeet_genome/read_threading/
read_threading_graph.rs

1use compare::{Compare, Extract};
2use gkl::smithwaterman::{OverhangStrategy, Parameters};
3use hashlink::LinkedHashMap;
4use petgraph::stable_graph::{EdgeIndex, NodeIndex};
5use petgraph::visit::EdgeRef;
6use petgraph::Direction;
7use rayon::prelude::*;
8use rust_htslib::bam::record::Cigar;
9use std::cmp::{max, min};
10use std::collections::HashSet;
11
12use crate::assembly::kmer::Kmer;
13use crate::graphs::base_edge::BaseEdge;
14use crate::graphs::base_edge::BaseEdgeStruct;
15use crate::graphs::base_graph::BaseGraph;
16use crate::graphs::base_vertex::BaseVertex;
17use crate::graphs::multi_sample_edge::MultiSampleEdge;
18use crate::graphs::seq_graph::SeqGraph;
19use crate::pair_hmm::pair_hmm_likelihood_calculation_engine::AVXMode;
20use crate::read_threading::abstract_read_threading_graph::{
21    AbstractReadThreadingGraph, DanglingChainMergeHelper, SequenceForKmers, TraversalDirection,
22};
23use crate::read_threading::multi_debruijn_vertex::MultiDeBruijnVertex;
24use crate::reads::alignment_utils::AlignmentUtils;
25use crate::reads::bird_tool_reads::BirdToolRead;
26use crate::reads::cigar_utils::CigarUtils;
27use crate::smith_waterman::smith_waterman_aligner::SmithWatermanAligner;
28use crate::utils::simple_interval::Locatable;
29
/**
 * Note: not final but only intended to be subclassed for testing.
 */
#[derive(Debug, Clone)]
pub struct ReadThreadingGraph {
    /**
     * A set of non-unique kmers that cannot be used as merge points in the graph
     */
    non_unique_kmers: HashSet<Kmer>,
    // Minimum number of matching bases required to recover a dangling end.
    // NOTE(review): the -1 default appears to mean "disabled" — confirm against callers.
    min_matching_bases_to_dangling_end_recovery: i32,
    // Number of sequences threaded so far; used to name per-sequence debug graph dumps.
    counter: usize,
    /**
     * Sequences added for read threading before we've actually built the graph
     */
    // pending: LinkedHashMap<usize, Vec<SequenceForKmers>>,
    /**
     * A map from kmers -> their corresponding vertex in the graph
     */
    kmer_to_vertex_map: LinkedHashMap<Kmer, NodeIndex>,
    // When true, vertices record the names of reads threaded through them (see thread_sequence).
    debug_graph_transformations: bool,
    // Bases below this quality split a read into separate sub-sequences for threading.
    min_base_quality_to_use_in_assembly: u8,
    // Vertices of the reference path, in threading order; filled while threading the ref sequence.
    pub reference_path: Vec<NodeIndex>,
    // Set once build_graph_if_necessary has run; guards against double-building.
    already_built: bool,
    // --------------------------------------------------------------------------------
    // state variables, initialized in setToInitialState()
    // --------------------------------------------------------------------------------
    // Kmer at the start of the reference sequence, recorded when the ref is threaded.
    ref_source: Option<Kmer>,
    // Policy switch: if true, threading may only start at kmers already in kmer_to_vertex_map.
    start_threading_only_at_existing_vertex: bool,
    max_mismatches_in_dangling_head: i32,
    // If true, backwards count propagation follows branching vertices too.
    increase_counts_through_branches: bool,
    num_pruning_samples: usize,
    pub(crate) base_graph: BaseGraph<MultiDeBruijnVertex, MultiSampleEdge>,
    avx_mode: AVXMode,
}
64
65impl ReadThreadingGraph {
66    pub const ANONYMOUS_SAMPLE: &'static str = "XXX_UNNAMED_XXX";
67    const WRITE_GRAPH: bool = false;
68    // const DEBUG_NON_UNIQUE_CALC: bool = false;
69    pub const MAX_CIGAR_COMPLEXITY: usize = 3;
70    const INCREASE_COUNTS_BACKWARDS: bool = true;
71
72    pub fn new(
73        kmer_size: usize,
74        _debug_graph_transformations: bool,
75        min_base_quality_to_use_in_assembly: u8,
76        min_pruning_samples: usize,
77        min_matching_bases_to_dangling_end_recovery: i32,
78        avx_mode: AVXMode,
79    ) -> Self {
80        let base_graph = BaseGraph::new(kmer_size);
81        Self {
82            non_unique_kmers: HashSet::new(),
83            min_matching_bases_to_dangling_end_recovery,
84            counter: 0,
85            // pending: LinkedHashMap::new(),
86            kmer_to_vertex_map: LinkedHashMap::new(),
87            debug_graph_transformations: true,
88            min_base_quality_to_use_in_assembly,
89            reference_path: Vec::new(),
90            already_built: false,
91            ref_source: None,
92            start_threading_only_at_existing_vertex: false,
93            max_mismatches_in_dangling_head: -1,
94            increase_counts_through_branches: false,
95            num_pruning_samples: min_pruning_samples,
96            base_graph,
97            avx_mode,
98        }
99    }
100
101    pub fn default_with_kmer_size(kmer_size: usize) -> Self {
102        Self::new(kmer_size, false, 6, 1, -1, AVXMode::detect_mode())
103    }
104
105    /**
106     * Get the collection of non-unique kmers from sequence for kmer size kmerSize
107     * @param seqForKmers a sequence to get kmers from
108     * @param kmerSize the size of the kmers
109     * @return a non-null collection of non-unique kmers in sequence
110     */
111    pub fn determine_non_unique_kmers<'a>(
112        seq_for_kmers: &SequenceForKmers<'a>,
113        kmer_size: usize,
114    ) -> Vec<Kmer> {
115        // count up occurrences of kmers within each read
116        let mut all_kmers = HashSet::new();
117        let mut non_unique_kmers = Vec::new();
118
119        let stop_position = seq_for_kmers.stop.checked_sub(kmer_size);
120        match stop_position {
121            None => {
122                // pass
123            }
124            Some(stop_position) => {
125                for i in 0..=stop_position {
126                    // if seq_for_kmers.sequence.len() <= i + kmer_size {
127                    //     break;
128                    // }
129                    let kmer =
130                        Kmer::new_with_start_and_length(seq_for_kmers.sequence, i, kmer_size);
131                    if all_kmers.contains(&kmer) {
132                        non_unique_kmers.push(kmer);
133                    } else {
134                        all_kmers.insert(kmer);
135                    };
136                }
137            }
138        }
139
140        return non_unique_kmers;
141    }
142
143    /**
144     * Get the collection of all sequences for kmers across all samples in no particular order
145     * @return non-null Collection
146     */
147    fn get_all_pending_sequences<'a, 'b>(
148        pending: &'b LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
149    ) -> Vec<&'b SequenceForKmers<'a>> {
150        return pending
151            .values()
152            .flat_map(|one_sample_worth| one_sample_worth)
153            .collect::<Vec<&SequenceForKmers>>();
154    }
155
156    /**
157     * Compute the smallest kmer size >= minKmerSize and <= maxKmerSize that has no non-unique kmers
158     * among all sequences added to the current graph.  Will always return a result for maxKmerSize if
159     * all smaller kmers had non-unique kmers.
160     *
161     * @param kmerSize the kmer size to check for non-unique kmers of
162     * @return a non-null NonUniqueResult
163     */
164    fn determine_non_uniques(
165        &self,
166        kmer_size: usize,
167        with_non_uniques: Vec<&SequenceForKmers>,
168    ) -> HashSet<Kmer> {
169        // loop over all sequences that have non-unique kmers in them from the previous iterator
170        with_non_uniques
171            .iter()
172            .filter_map(|sequence_for_kmers| {
173                let non_uniques_from_seq =
174                    Self::determine_non_unique_kmers(
175                        sequence_for_kmers, 
176                        kmer_size
177                    );
178                if !non_uniques_from_seq.is_empty() {
179                    // keep track of the non-uniques for this kmerSize, and keep it in the list of sequences that have non-uniques
180                    Some(non_uniques_from_seq)
181                } else {
182                    None
183                }
184            })
185            .flat_map(|non_uniques| non_uniques.into_iter())
186            .collect::<HashSet<Kmer>>()
187    }
188
189    // /**
190    //  * Returns the pending sequences, swapping out the old values for an empty hashmap. This
191    //  * effectively clears out the result and gives ownership of pending the function call.
192    //  * Used to avoid cloning in the build_graph_if_necessary()
193    //  */
194    // pub fn get_pending(&mut self) -> LinkedHashMap<usize, Vec<SequenceForKmers>> {
195    //     std::mem::replace(&mut self.pending, LinkedHashMap::new())
196    // }
197    //
198    // fn replace_pending(&mut self, used_pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers>>) {
199    //     std::mem::swap(used_pending, &mut self.pending)
200    // }
201
202    /**
203     * Get the set of non-unique kmers in this graph.  For debugging purposes
204     * @return a non-null set of kmers
205     */
206    pub fn get_non_uniques(&mut self) -> &HashSet<Kmer> {
207        &self.non_unique_kmers
208    }
209}
210
211impl AbstractReadThreadingGraph for ReadThreadingGraph {
    /// Immutable access to the underlying de Bruijn base graph.
    fn get_base_graph(&self) -> &BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
        &self.base_graph
    }
215
    /// Mutable access to the underlying de Bruijn base graph.
    fn get_base_graph_mut(&mut self) -> &mut BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
        &mut self.base_graph
    }
219
    /// Look up the vertex tracked for `kmer`, if any (only unique kmers are tracked).
    fn find_kmer(&self, kmer: &Kmer) -> Option<&NodeIndex> {
        self.kmer_to_vertex_map.get(kmer)
    }
223
    /// Set whether backwards count propagation may follow branching vertices.
    fn set_increase_counts_through_branches(&mut self, increase_counts_through_branches: bool) {
        self.increase_counts_through_branches = increase_counts_through_branches;
    }
227
    /// Set the minimum matching-base threshold for dangling-end recovery.
    fn set_min_matching_bases_to_dangling_end_recovery(&mut self, min_matching_bases: i32) {
        self.min_matching_bases_to_dangling_end_recovery = min_matching_bases;
    }
231
232    /**
233     * Checks whether a kmer can be the threading start based on the current threading start location policy.
234     *
235     * @param kmer the query kmer.
236     * @return {@code true} if we can start thread the sequence at this kmer, {@code false} otherwise.
237     * @see #setThreadingStartOnlyAtExistingVertex(boolean)
238     */
239    fn is_threading_start(
240        &self,
241        kmer: &Kmer,
242    ) -> bool {
243        if self.start_threading_only_at_existing_vertex {
244            self.kmer_to_vertex_map.contains_key(kmer)
245        } else {
246            !self.non_unique_kmers.contains(kmer)
247        }
248    }
249
    /// True if the underlying base graph contains at least one cycle.
    fn has_cycles(&self) -> bool {
        self.base_graph.has_cycles()
    }
253
254    /**
255     * Does the graph not have enough complexity?  We define low complexity
256     * as a situation where the number of non-unique kmers is more than 20%
257     * of the total number of kmers.
258     *
259     * @return true if the graph has low complexity, false otherwise
260     */
261    fn is_low_quality_graph(&self) -> bool {
262        return self.non_unique_kmers.len() * 4 > self.kmer_to_vertex_map.len();
263    }
264
265    // get the next kmerVertex for ChainExtension and validate if necessary.
266    fn get_next_kmer_vertex_for_chain_extension(
267        &self,
268        kmer: &Kmer,
269        is_ref: bool,
270        _prev_vertex: NodeIndex,
271    ) -> Option<&NodeIndex> {
272        let unique_merge_vertex = self.get_kmer_vertex(kmer, false);
273        assert!(
274            !(is_ref && unique_merge_vertex.is_some()),
275            "Did not find a unique vertex to merge into the reference path: {} {}", is_ref, unique_merge_vertex.is_some()
276        );
277
278        return unique_merge_vertex;
279    }
280
281    /**
282     * Since we want to duplicate non-unique kmers in the graph code we must determine what those kmers are
283     */
284    fn preprocess_reads<'a>(&mut self, pending: &LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>) {
285        // debug!(
286        //     "All pending before preprocess {}",
287        //     Self::get_all_pending_sequences(pending).len()
288        // );
289        self.non_unique_kmers = self.determine_non_uniques(
290            self.base_graph.get_kmer_size(),
291            Self::get_all_pending_sequences(pending),
292        );
293        // debug!(
294        //     "All pending after preprocess {}",
295        //     Self::get_all_pending_sequences(pending).len()
296        // );
297        // debug!("Non-uniques {}", self.non_unique_kmers.len());
298    }
299
300    // whether reads are needed after graph construction
301    fn should_remove_reads_after_graph_construction(&self) -> bool {
302        return true;
303    }
304
305    /**
306     * Add bases in sequence to this graph
307     *
308     * @param seqName  a useful seqName for this read, for debugging purposes
309     * @param sequence non-null sequence of bases
310     * @param start    the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
311     * @param stop     the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
312     * @param count    the representative count of this sequence (to use as the weight)
313     * @param isRef    is this the reference sequence.
314     */
315    fn add_sequence<'a>(
316        &self,
317        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
318        seq_name: String,
319        sample_index: usize,
320        sequence: &'a [u8],
321        start: usize,
322        stop: usize,
323        count: usize,
324        is_ref: bool,
325    ) {
326        // note that argument testing is taken care of in SequenceForKmers
327        // get the list of sequences for this sample
328        let sample_sequences = pending.entry(sample_index).or_insert(Vec::new());
329        // add the new sequence to the list of sequences for sample
330        sample_sequences.push(SequenceForKmers::new(
331            seq_name, sequence, start, stop, count, is_ref,
332        ))
333    }
334
335    /**
336     * Add a read to the sequence graph.  Finds maximal consecutive runs of bases with sufficient quality
337     * and applies {@see addSequence} to these subreads if they are longer than the kmer size.
338     *
339     * @param read a non-null read
340     */
341    fn add_read<'a>(
342        &mut self,
343        read: &'a BirdToolRead,
344        _sample_names: &[String],
345        count: &mut usize,
346        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
347    ) {
348        let sequence = read.seq();
349        let qualities = read.read.qual();
350        if sequence.is_empty() || qualities.is_empty() {
351            return;
352        }
353
354        let mut last_good = -1;
355        for end in 0..=sequence.len() {
356            if end as usize == sequence.len()
357                || !self.base_is_usable_for_assembly(sequence[end], qualities[end])
358            {
359                // the first good base is at lastGood, can be -1 if last base was bad
360                let start = last_good;
361                // the stop base is end - 1 (if we're not at the end of the sequence)
362                let len = end as i32 - start;
363
364                if start != -1 && len >= self.base_graph.get_kmer_size() as i32 {
365                    // if the sequence is long enough to get some value out of, add it to the graph
366                    let name = format!(
367                        "{}_{}_{}",
368                        std::str::from_utf8(read.read.qname()).unwrap(),
369                        start,
370                        end
371                    );
372                    // debug!("Read {name}");
373                    self.add_sequence(
374                        pending,
375                        name,
376                        read.sample_index,
377                        sequence,
378                        start as usize,
379                        end,
380                        1,
381                        false,
382                    );
383                    *count += 1;
384                }
385                last_good = -1;
386            } else if last_good == -1 {
387                last_good = end as i32;
388            }
389        }
390    }
391
392    /**
393     * Determines whether a base can safely be used for assembly.
394     * Currently disallows Ns and/or those with low quality
395     *
396     * @param base  the base under consideration
397     * @param qual  the quality of that base
398     * @return true if the base can be used for assembly, false otherwise
399     */
400    fn base_is_usable_for_assembly(&self, base: u8, qual: u8) -> bool {
401        return base.to_ascii_uppercase() != b'N'
402            && qual >= self.min_base_quality_to_use_in_assembly;
403    }
404
    /// Set the policy used by `is_threading_start`: when true, threading may only
    /// begin at kmers already present in the kmer-to-vertex map.
    fn set_threading_start_only_at_existing_vertex(&mut self, value: bool) {
        self.start_threading_only_at_existing_vertex = value
    }
408
    /// Write the graph in DOT format to `file_name`.
    /// Note: `_prune_factor` is ignored; the graph is always printed with prune factor 0.
    fn print_graph(&self, file_name: String, _prune_factor: usize) {
        self.base_graph.print_graph(&file_name, true, 0);
    }
412
    /**
     * Build the read threaded assembly graph if it hasn't already been constructed from the sequences that have
     * been added to the graph.
     */
    fn build_graph_if_necessary<'a>(
        &mut self,
        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
    ) {
        // idempotent: a second call is a no-op once the graph has been built
        if self.already_built {
            return;
        }

        // Capture the set of non-unique kmers for the given kmer size (if applicable)
        self.preprocess_reads(&pending);

        // go through the pending sequences, and add them to the graph
        for (_name, sequences_for_samples) in pending.iter() {
            for sequence_for_kmers in sequences_for_samples.iter() {
                self.thread_sequence(sequence_for_kmers);
                if Self::WRITE_GRAPH {
                    // debug dump named by the running sequence counter
                    self.base_graph.print_graph(
                        &format!(
                            "threading.{}.{}.dot",
                            self.counter,
                            sequence_for_kmers.name.replace(" ", "_")
                        ),
                        true,
                        0,
                    );
                }
                self.counter += 1;
            }
            // flush the single sample edge values from the graph
            // NOTE: this runs once per sample entry, after all of that sample's
            // sequences are threaded — the ordering matters for per-sample
            // multiplicity bookkeeping on the edges.
            for e in self.base_graph.graph.edge_weights_mut() {
                e.flush_single_sample_multiplicity()
            }
        }

        self.already_built = true;
        // mark every tracked (unique) kmer vertex by appending '+' to its additional info
        for (_, v) in self.kmer_to_vertex_map.iter_mut() {
            let node = self.base_graph.graph.node_weight_mut(*v).unwrap();
            node.set_additional_info(format!("{}+", node.get_additional_info()));
        }
    }
478
479    /**
480     * Thread sequence seqForKmers through the current graph, updating the graph as appropriate
481     *
482     * @param seqForKmers a non-null sequence
483     */
484    fn thread_sequence(&mut self, seq_for_kmers: &SequenceForKmers) {
485        let start_pos_option = self.find_start(seq_for_kmers);
486
487        match start_pos_option {
488            None => {
489                // do nothing :)
490            }
491            Some(start_pos) => {
492                if seq_for_kmers.sequence.len() <= start_pos + self.base_graph.get_kmer_size() {
493                    // debug!("Sequence length {}", seq_for_kmers.sequence.len());
494                    return;
495                }
496                let starting_vertex =
497                    self.get_or_create_kmer_vertex(&seq_for_kmers.sequence, start_pos);
498                // increase the counts of all edges incoming into the starting vertex supported by going back in sequence
499                if Self::INCREASE_COUNTS_BACKWARDS {
500                    // let original_kmer = self.base_graph.graph.node_weight(starting_vertex).unwrap().get_sequence().to_vec();
501                    self.increase_counts_in_matched_kmers(
502                        seq_for_kmers,
503                        starting_vertex,
504                        &seq_for_kmers.sequence,
505                        self.base_graph.get_kmer_size().checked_sub(2),
506                    );
507                }
508
509                if self.debug_graph_transformations {
510                    self.base_graph
511                        .graph
512                        .node_weight_mut(starting_vertex)
513                        .unwrap()
514                        .add_read(seq_for_kmers.name.clone());
515                }
516
517                // keep track of information about the reference source
518                if seq_for_kmers.is_ref {
519                    if self.ref_source.is_some() {
520                        panic!(
521                            "Found two ref_sources! prev: {:?}, new: {:?}",
522                            self.ref_source, seq_for_kmers
523                        )
524                    }
525                    self.reference_path = Vec::with_capacity(
526                        seq_for_kmers.sequence.len() - self.base_graph.get_kmer_size(),
527                    );
528                    self.reference_path.push(starting_vertex);
529                    self.ref_source = Some(Kmer::new_with_start_and_length(
530                        seq_for_kmers.sequence,
531                        seq_for_kmers.start,
532                        self.base_graph.get_kmer_size(),
533                    ));
534                };
535
536                let mut vertex = starting_vertex;
537
538                for i in start_pos + 1..=(seq_for_kmers.stop - self.base_graph.get_kmer_size()) {
539                    vertex = self.extend_chain_by_one(
540                        vertex,
541                        &seq_for_kmers.sequence,
542                        i,
543                        seq_for_kmers.count,
544                        seq_for_kmers.is_ref,
545                    );
546                    if seq_for_kmers.is_ref {
547                        self.reference_path.push(vertex);
548                    }
549                    if self.debug_graph_transformations {
550                        self.base_graph
551                            .graph
552                            .node_weight_mut(starting_vertex)
553                            .unwrap()
554                            .add_read(seq_for_kmers.name.clone());
555                    }
556                }
557                // TODO: Here GATK changes reference_path to an unmodifiable list, I'm not sure
558                //      if you can change mutability like that in Rust?
559            }
560        }
561    }
562
563    /**
564     * Find vertex and its position in seqForKmers where we should start assembling seqForKmers
565     *
566     * @param seqForKmers the sequence we want to thread into the graph
567     * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one
568     */
569    fn find_start(&self, seq_for_kmers: &SequenceForKmers) -> Option<usize> {
570        if seq_for_kmers.is_ref {
571            return Some(0);
572        } else {
573            let stop = seq_for_kmers.stop.saturating_sub(self.base_graph.get_kmer_size());
574
575            for i in seq_for_kmers.start..stop {
576                let kmer1 = Kmer::new_with_start_and_length(
577                    seq_for_kmers.sequence,
578                    i,
579                    self.base_graph.get_kmer_size(),
580                );
581                if self.is_threading_start(&kmer1) {
582                    return Some(i);
583                }
584            }
585
586            return None;
587        }
588    }
589
590    /**
591     * Get the vertex for the kmer in sequence starting at start
592     *
593     * @param sequence the sequence
594     * @param start    the position of the kmer start
595     * @return a non-null vertex
596     */
597    fn get_or_create_kmer_vertex(&mut self, sequence: &[u8], start: usize) -> NodeIndex {
598        let kmer =
599            Kmer::new_with_start_and_length(sequence, start, self.base_graph.get_kmer_size());
600        let vertex = self.get_kmer_vertex(&kmer, true);
601        match vertex {
602            None => self.create_vertex(sequence, kmer),
603            Some(vertex) => *vertex,
604        }
605    }
606
607    /**
608     * Get the unique vertex for kmer, or null if not possible.
609     *
610     * @param allowRefSource if true, we will allow kmer to match the reference source vertex
611     * @return a vertex for kmer, or null (either because it doesn't exist or is non-unique for graphs that have such a distinction)
612     */
613    fn get_kmer_vertex(&self, kmer: &Kmer, allow_ref_source: bool) -> Option<&NodeIndex> {
614        if !allow_ref_source && self.ref_source.as_ref() == Some(kmer) {
615            return None;
616        }
617        return self.kmer_to_vertex_map.get(kmer);
618    }
619
620    /**
621     * Create a new vertex for kmer.  Add it to the kmerToVertexMap map if appropriate.
622     *
623     * @param kmer the kmer we want to create a vertex for
624     * @return the non-null created vertex
625     */
626    fn create_vertex(&mut self, sequence: &[u8], kmer: Kmer) -> NodeIndex {
627        let new_vertex = MultiDeBruijnVertex::new(kmer.bases(sequence).to_vec(), false);
628        let node_index = self.base_graph.add_node(&new_vertex);
629        self.track_kmer(kmer, node_index);
630
631        return node_index;
632    }
633
634    // only add the new kmer to the map if it exists and isn't in our non-unique kmer list
635    fn track_kmer(&mut self, kmer: Kmer, new_vertex: NodeIndex) {
636        if !self.non_unique_kmers.contains(&kmer) && !self.kmer_to_vertex_map.contains_key(&kmer) {
637            self.kmer_to_vertex_map.insert(kmer, new_vertex);
638        };
639    }
640
641    fn increase_counts_in_matched_kmers(
642        &mut self,
643        seqs_for_kmers: &SequenceForKmers,
644        vertex: NodeIndex,
645        original_kmer: &[u8],
646        offset: Option<usize>,
647    ) {
648        match offset {
649            None => {} // do nothing :|,
650            Some(offset) => {
651                let outgoing_edges = self
652                    .base_graph
653                    .graph
654                    .edges_directed(vertex, Direction::Incoming)
655                    .map(|e| e.id())
656                    .collect::<Vec<EdgeIndex>>();
657
658                for edge in outgoing_edges {
659                    let prev = self.base_graph.get_edge_source(edge);
660                    let suffix = self
661                        .base_graph
662                        .graph
663                        .node_weight(prev)
664                        .unwrap()
665                        .get_suffix();
666                    let seq_base = original_kmer[offset];
667
668                    if suffix == seq_base
669                        && (self.increase_counts_through_branches
670                            || self.base_graph.in_degree_of(vertex) == 1)
671                    {
672                        self.base_graph
673                            .graph
674                            .edge_weight_mut(edge)
675                            .unwrap()
676                            .inc_multiplicity(seqs_for_kmers.count);
677                        self.increase_counts_in_matched_kmers(
678                            seqs_for_kmers,
679                            prev,
680                            original_kmer,
681                            offset.checked_sub(1),
682                        );
683                    }
684                }
685            }
686        }
687    }
688
689    /**
690     * Workhorse routine of the assembler.  Given a sequence whose last vertex is anchored in the graph, extend
691     * the graph one bp according to the bases in sequence.
692     *
693     * @param prevVertex a non-null vertex where sequence was last anchored in the graph
694     * @param sequence   the sequence we're threading through the graph
695     * @param kmerStart  the start of the current kmer in graph we'd like to add
696     * @param count      the number of observations of this kmer in graph (can be > 1 for GGA)
697     * @param isRef      is this the reference sequence?
698     * @return a non-null vertex connecting prevVertex to in the graph based on sequence
699     */
700    fn extend_chain_by_one(
701        &mut self,
702        prev_vertex: NodeIndex,
703        sequence: &[u8],
704        kmer_start: usize,
705        count: usize,
706        is_ref: bool,
707    ) -> NodeIndex {
708        let outgoing_edges = self
709            .base_graph
710            .graph
711            .edges_directed(prev_vertex, Direction::Outgoing);
712        let next_pos = kmer_start + self.base_graph.get_kmer_size() - 1;
713        let mut found_match = None;
714        let mut return_target = None;
715        for outgoing_edge in outgoing_edges {
716            let target = self
717                .base_graph
718                .graph
719                .edge_endpoints(outgoing_edge.id())
720                .unwrap()
721                .1;
722            if self
723                .base_graph
724                .graph
725                .node_weight(target)
726                .unwrap()
727                .get_suffix()
728                == sequence[next_pos]
729            {
730                found_match = Some(outgoing_edge.id());
731                return_target = Some(target);
732                // can't directly access the edge weight here, so we break and do it below
733                // as we need a mutable reference to it.
734                break;
735            }
736        }
737
738        if found_match.is_some() {
739            // we've got a match in the chain, so simply increase the count of the edge by 1 and return
740            // the target node index
741            self.base_graph
742                .graph
743                .edge_weight_mut(found_match.unwrap())
744                .unwrap()
745                .inc_multiplicity(count);
746            return return_target.unwrap();
747        }
748
749        // none of our outgoing edges had our unique suffix base, so we check for an opportunity to merge back in
750        let kmer =
751            Kmer::new_with_start_and_length(sequence, kmer_start, self.base_graph.get_kmer_size());
752        let merge_vertex =
753            self.get_next_kmer_vertex_for_chain_extension(&kmer, is_ref, prev_vertex);
754
755        let next_vertex = if merge_vertex.is_none() {
756            self.create_vertex(sequence, kmer)
757        } else {
758            *merge_vertex.unwrap()
759        };
760
761        let _edge_index = self.base_graph.graph.add_edge(
762            prev_vertex,
763            next_vertex,
764            MultiSampleEdge::new(is_ref, count, self.num_pruning_samples),
765        );
766        // debug!("adding edge {:?} : {:?} -> {:?}, ref {} count {}, prune {}", edge_index, prev_vertex, next_vertex, is_ref, count, self.num_pruning_samples);
767
768        return next_vertex;
769    }
770
771    /**
772     * Try to recover dangling tails
773     *
774     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
775     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
776     * @param recoverAll              recover even branches with forks
777     * @param aligner
778     */
779    fn recover_dangling_tails(
780        &mut self,
781        prune_factor: usize,
782        min_dangling_branch_length: i32,
783        recover_all: bool,
784        dangling_end_sw_parameters: &Parameters,
785    ) {
786        if !self.already_built {
787            panic!("recover_dangling_tails requires that graph already be built")
788        }
789
790        // we need to build a list of dangling tails because that process can modify the graph
791        // (and otherwise attempt to hold multiple mutable references if we do it while iterating over the vertexes)
792        let dangling_tails = self
793            .base_graph
794            .graph
795            .node_indices()
796            .filter(|v| self.base_graph.out_degree_of(*v) == 0 && !self.base_graph.is_ref_sink(*v))
797            .collect::<Vec<NodeIndex>>();
798
799        // let mut attempted = 0;
800        // let mut n_recovered = 0;
801        for v in dangling_tails {
802            // attempted += 1;
803            // n_recovered += 
804            self.recover_dangling_tail(
805                v,
806                prune_factor,
807                min_dangling_branch_length,
808                recover_all,
809                dangling_end_sw_parameters,
810            );
811        }
812
813        // debug!("Recovered {} of {} dangling tails", n_recovered, attempted);
814    }
815
816    /**
817     * Try to recover dangling heads
818     *
819     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
820     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
821     * @param recoverAll              recover even branches with forks
822     * @param aligner
823     */
824    fn recover_dangling_heads(
825        &mut self,
826        prune_factor: usize,
827        min_dangling_branch_length: i32,
828        recover_all: bool,
829        dangling_head_sw_parameters: &Parameters,
830    ) {
831        if !self.already_built {
832            panic!("recover_dangling_tails requires that graph already be built")
833        }
834
835        // we need to build a list of dangling heads because that process can modify the graph (and otherwise generate
836        // a ConcurrentModificationException if we do it while iterating over the vertexes)
837        let dangling_heads = self
838            .base_graph
839            .graph
840            .node_indices()
841            .filter(|v| self.base_graph.in_degree_of(*v) == 0 && !self.base_graph.is_ref_source(*v))
842            .collect::<Vec<NodeIndex>>();
843
844        // now we can try to recover the dangling heads
845        // let mut attempted = 0;
846        // let mut n_recovered = 0;
847        for v in dangling_heads {
848            // attempted += 1;
849            // n_recovered += 
850            self.recover_dangling_head(
851                v,
852                prune_factor,
853                min_dangling_branch_length,
854                recover_all,
855                dangling_head_sw_parameters,
856            );
857        }
858
859        // debug!("Recovered {} of {} dangling heads", n_recovered, attempted);
860    }
861
862    /**
863     * Attempt to attach vertex with out-degree == 0 to the graph
864     *
865     * @param vertex                  the vertex to recover
866     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
867     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
868     * @param aligner
869     * @return 1 if we successfully recovered the vertex and 0 otherwise
870     */
871    fn recover_dangling_tail(
872        &mut self,
873        vertex: NodeIndex,
874        prune_factor: usize,
875        min_dangling_branch_length: i32,
876        recover_all: bool,
877        dangling_tail_sw_parameters: &Parameters,
878    ) -> usize {
879        if self.base_graph.out_degree_of(vertex) != 0 {
880            panic!(
881                "Attempting to recover a dangling tail for {:?} but it has out-degree > 0",
882                vertex
883            )
884        };
885
886        // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
887        let dangling_tail_merge_result = self.generate_cigar_against_downwards_reference_path(
888            vertex,
889            prune_factor,
890            min_dangling_branch_length,
891            recover_all,
892            dangling_tail_sw_parameters,
893        );
894
895        match dangling_tail_merge_result {
896            None => return 0,
897            Some(dangling_tail_merge_result) => {
898                if !Self::cigar_is_okay_to_merge(&dangling_tail_merge_result.cigar, false, true) {
899                    return 0;
900                } else {
901                    self.merge_dangling_tail(dangling_tail_merge_result)
902                }
903            }
904        }
905    }
906
907    /**
908     * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph
909     *
910     * @param vertex                  the vertex to recover
911     * @param pruneFactor             the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
912     * @param minDanglingBranchLength the minimum length of a dangling branch for us to try to merge it
913     * @param recoverAll              recover even branches with forks
914     * @param aligner
915     * @return 1 if we successfully recovered a vertex and 0 otherwise
916     */
917    fn recover_dangling_head(
918        &mut self,
919        vertex: NodeIndex,
920        prune_factor: usize,
921        min_dangling_branch_length: i32,
922        recover_all: bool,
923        dangling_head_sw_parameters: &Parameters,
924    ) -> usize {
925        if self.base_graph.in_degree_of(vertex) != 0 {
926            panic!(
927                "Attempting to recover a dangling head for {:?} but it has in-degree > 0",
928                vertex
929            )
930        };
931
932        // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths
933        let dangling_head_merge_result = self.generate_cigar_against_upwards_reference_path(
934            vertex,
935            prune_factor,
936            min_dangling_branch_length,
937            recover_all,
938            dangling_head_sw_parameters,
939        );
940
941        match dangling_head_merge_result {
942            None => return 0,
943            Some(dangling_head_merge_result) => {
944                if !Self::cigar_is_okay_to_merge(&dangling_head_merge_result.cigar, true, false) {
945                    return 0;
946                } else if self.min_matching_bases_to_dangling_end_recovery >= 0 {
947                    self.merge_dangling_head(dangling_head_merge_result)
948                } else {
949                    self.merge_dangling_head_legacy(dangling_head_merge_result)
950                }
951            }
952        }
953    }
954
955    /**
956     * Actually merge the dangling tail if possible
957     *
958     * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference
959     * @return 1 if merge was successful, 0 otherwise
960     */
961    fn merge_dangling_tail(
962        &mut self,
963        dangling_tail_merge_result: DanglingChainMergeHelper,
964    ) -> usize {
965        let elements = &dangling_tail_merge_result.cigar.0;
966        let last_element = elements[elements.len() - 1];
967
968        // the last element must be an M
969        match last_element {
970            Cigar::Match(_) => {
971                // correct operator but more checks to do
972            }
973            _ => panic!("Last cigar element must be Match(_)"),
974        };
975
976        let last_ref_index =
977            CigarUtils::get_reference_length(&dangling_tail_merge_result.cigar) - 1;
978        let matching_suffix = min(
979            Self::longest_suffix_match(
980                dangling_tail_merge_result.reference_path_string.as_bytes(),
981                dangling_tail_merge_result.dangling_path_string.as_bytes(),
982                last_ref_index as i64,
983            ),
984            last_element.len() as usize,
985        );
986
987        if self.min_matching_bases_to_dangling_end_recovery >= 0 {
988            if matching_suffix < self.min_matching_bases_to_dangling_end_recovery as usize {
989                return 0;
990            }
991        } else if matching_suffix == 0 {
992            return 0;
993        }
994
995        let alt_index_to_merge = (CigarUtils::get_read_length(&dangling_tail_merge_result.cigar)
996            as usize)
997            .checked_sub(matching_suffix)
998            .unwrap_or(0)
999            .checked_sub(1)
1000            .unwrap_or(0);
1001
1002        // there is an important edge condition that we need to handle here: Smith-Waterman correctly calculates that there is a
1003        // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling
1004        // tail is a perfect match to the suffix of the reference path.  In this case we need to push the reference index to merge
1005        // down one position so that we don't incorrectly cut a base off of the deletion.
1006        let first_element_is_deletion = CigarUtils::cigar_elements_are_same_type(
1007            &dangling_tail_merge_result.cigar.0[0],
1008            &Some(Cigar::Del(0)),
1009        );
1010
1011        let must_handle_leading_deletion_case = first_element_is_deletion
1012            && ((&dangling_tail_merge_result.cigar.0[0].len() + matching_suffix as u32)
1013                == last_ref_index + 1);
1014        let ref_index_to_merge = last_ref_index as i32 - matching_suffix as i32
1015            + 1
1016            + if must_handle_leading_deletion_case {
1017                1
1018            } else {
1019                0
1020            };
1021
1022        // another edge condition occurs here: if Smith-Waterman places the whole tail into an insertion then it will try to
1023        // merge back to the LCA, which results in a cycle in the graph.  So we do not want to merge in such a case.
1024        if ref_index_to_merge <= 0 {
1025            return 0;
1026        }
1027
1028        // it's safe to merge now
1029        self.base_graph.graph.add_edge(
1030            dangling_tail_merge_result.dangling_path[alt_index_to_merge],
1031            dangling_tail_merge_result.reference_path[ref_index_to_merge as usize],
1032            MultiSampleEdge::new(false, 1, self.num_pruning_samples),
1033        );
1034
1035        return 1;
1036    }
1037
1038    /**
1039     * Actually merge the dangling head if possible, this is the old codepath that does not handle indels
1040     *
1041     * @param danglingHeadMergeResult   the result from generating a Cigar for the dangling head against the reference
1042     * @return 1 if merge was successful, 0 otherwise
1043     */
1044    fn merge_dangling_head_legacy(
1045        &mut self,
1046        mut dangling_head_merge_result: DanglingChainMergeHelper,
1047    ) -> usize {
1048        let first_element = &dangling_head_merge_result.cigar.0[0];
1049
1050        // the last element must be an M
1051        match first_element {
1052            Cigar::Match(_) => {
1053                // correct operator but more checks to do
1054            }
1055            _ => panic!("First cigar element must be Match(_)"),
1056        };
1057
1058        let indexes_to_merge = self.best_prefix_match_legacy(
1059            dangling_head_merge_result.reference_path_string.as_bytes(),
1060            dangling_head_merge_result.dangling_path_string.as_bytes(),
1061            first_element.len() as usize,
1062        );
1063
1064        match indexes_to_merge {
1065            None => return 0,
1066            Some(indexes_to_merge) => {
1067                // we can't push back the reference path
1068                if indexes_to_merge >= dangling_head_merge_result.reference_path.len() - 1 {
1069                    return 0;
1070                }
1071
1072                // but we can manipulate the dangling path if we need to
1073                let num_nodes_to_extend =
1074                    indexes_to_merge - dangling_head_merge_result.dangling_path.len() + 2;
1075
1076                if indexes_to_merge >= dangling_head_merge_result.dangling_path.len()
1077                    && !self.extend_dangling_path_against_reference(
1078                        &mut dangling_head_merge_result,
1079                        num_nodes_to_extend,
1080                    )
1081                {
1082                    return 0;
1083                }
1084
1085                // it's safe to merge now
1086                self.base_graph.graph.add_edge(
1087                    dangling_head_merge_result.reference_path[indexes_to_merge + 1 as usize],
1088                    dangling_head_merge_result.dangling_path[indexes_to_merge as usize],
1089                    MultiSampleEdge::new(false, 1, self.num_pruning_samples),
1090                );
1091
1092                return 1;
1093            }
1094        }
1095    }
1096
1097    /**
1098     * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
1099     * Assumes that path1.length >= maxIndex and path2.length >= maxIndex.
1100     *
1101     * @param path1    the first path
1102     * @param path2    the second path
1103     * @param maxIndex the maximum index to traverse (not inclusive)
1104     * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex
1105     */
1106    fn best_prefix_match_legacy(
1107        &self,
1108        path1: &[u8],
1109        path2: &[u8],
1110        max_index: usize,
1111    ) -> Option<usize> {
1112        let max_mismatches = self.get_max_mismatches_legacy(max_index);
1113
1114        let mut mismatches = 0;
1115        let mut index = 0;
1116        let mut last_good_index = None;
1117
1118        while index < max_index {
1119            if path1[index] != path2[index] {
1120                mismatches += 1;
1121                if mismatches > max_mismatches {
1122                    return None;
1123                }
1124                last_good_index = Some(index);
1125            }
1126            index += 1;
1127        }
1128
1129        // if we got here then we hit the max index
1130        return last_good_index;
1131    }
1132
1133    /**
1134     * NOTE: this method is only used for dangling heads and not tails.
1135     *
1136     * Determine the maximum number of mismatches permitted on the branch.
1137     * Unless it's preset (e.g. by unit tests) it should be the length of the branch divided by the kmer size.
1138     *
1139     * @param lengthOfDanglingBranch the length of the branch itself
1140     * @return positive integer
1141     */
1142    fn get_max_mismatches_legacy(&self, length_of_dangling_branch: usize) -> usize {
1143        return if self.max_mismatches_in_dangling_head > 0 {
1144            self.max_mismatches_in_dangling_head as usize
1145        } else {
1146            max(
1147                1,
1148                length_of_dangling_branch / self.base_graph.get_kmer_size(),
1149            )
1150        };
1151    }
1152
1153    /**
1154     * Actually merge the dangling head if possible
1155     *
1156     * @param danglingHeadMergeResult   the result from generating a Cigar for the dangling head against the reference
1157     * @return 1 if merge was successful, 0 otherwise
1158     */
1159    fn merge_dangling_head(
1160        &mut self,
1161        mut dangling_head_merge_result: DanglingChainMergeHelper,
1162    ) -> usize {
1163        let first_element = &dangling_head_merge_result.cigar.0[0];
1164
1165        // the last element must be an M
1166        match first_element {
1167            Cigar::Match(_) => {
1168                // correct operator but more checks to do
1169            }
1170            _ => panic!("First cigar element must be Match(_)"),
1171        };
1172
1173        let indexes_to_merge = self.best_prefix_match(
1174            &dangling_head_merge_result.cigar.0,
1175            dangling_head_merge_result.reference_path_string.as_bytes(),
1176            dangling_head_merge_result.dangling_path_string.as_bytes(),
1177        );
1178
1179        if indexes_to_merge.0 <= 0 || indexes_to_merge.1 <= 0 {
1180            return 0;
1181        };
1182
1183        // we can't push back the reference path
1184        if indexes_to_merge.0 as usize >= dangling_head_merge_result.reference_path.len() - 1 {
1185            return 0;
1186        };
1187
1188        // but we can manipulate the dangling path if we need to
1189        let num_nodes_to_extend =
1190            indexes_to_merge.1 - dangling_head_merge_result.dangling_path.len() as i64 + 2;
1191        if indexes_to_merge.1 >= dangling_head_merge_result.dangling_path.len() as i64
1192            && !self.extend_dangling_path_against_reference(
1193                &mut dangling_head_merge_result,
1194                (num_nodes_to_extend) as usize,
1195            )
1196        {
1197            return 0;
1198        }
1199
1200        self.base_graph.graph.add_edge(
1201            dangling_head_merge_result.reference_path[(indexes_to_merge.0 + 1) as usize],
1202            dangling_head_merge_result.dangling_path[(indexes_to_merge.1) as usize],
1203            MultiSampleEdge::new(false, 1, self.num_pruning_samples),
1204        );
1205
1206        return 1;
1207    }
1208
    /**
     * Extends the dangling path with vertices built from reference bases so that it becomes long
     * enough to be merged back onto the reference path.
     *
     * The dangling path's last vertex (its source) is removed and replaced by a chain of
     * `num_nodes_to_extend` new vertices whose kmers come from a sequence formed by prepending the
     * first `num_nodes_to_extend` bases of the chosen reference vertex to the removed vertex's
     * own sequence. The new vertices are appended to `dangling_path` as they are created.
     *
     * @param dangling_head_merge_result the alignment result holding the dangling and reference
     *                                   paths; its `dangling_path` is mutated in place
     * @param num_nodes_to_extend        how many new vertices to prepend to the dangling chain
     * @return true if the path was extended, false if the required reference vertex is out of range
     */
    fn extend_dangling_path_against_reference(
        &mut self,
        dangling_head_merge_result: &mut DanglingChainMergeHelper,
        num_nodes_to_extend: usize,
    ) -> bool {
        // index (within dangling_path) of the dangling source vertex, i.e. the path's last entry
        let index_of_last_dangling_node = dangling_head_merge_result.dangling_path.len() - 1;
        // net (reference bases consumed - read bases consumed) over the whole cigar: how far the
        // reference end sits from the dangling end in path coordinates
        let offset_for_ref_end_to_dangling_end = &dangling_head_merge_result
            .cigar
            .0
            .iter()
            .map(|ce| {
                let a = if CigarUtils::cigar_consumes_reference_bases(ce) {
                    ce.len() as i64
                } else {
                    0
                };

                let b = if CigarUtils::cigar_consumes_read_bases(ce) {
                    ce.len() as i64
                } else {
                    0
                };

                a - b
            })
            .sum::<i64>();

        // reference-path index of the vertex whose sequence will supply the extension bases
        let index_of_ref_node_to_use = index_of_last_dangling_node as i64
            + offset_for_ref_end_to_dangling_end
            + num_nodes_to_extend as i64;

        // bail out if that vertex does not exist on the reference path
        if index_of_ref_node_to_use >= dangling_head_merge_result.reference_path.len() as i64
            || index_of_ref_node_to_use < 0
        {
            return false;
        }

        // detach the dangling source vertex from the path; it is re-threaded via new edges below
        let dangling_source = dangling_head_merge_result
            .dangling_path
            .remove(index_of_last_dangling_node);
        let ref_source_sequence = self
            .base_graph
            .graph
            .node_weight(
                dangling_head_merge_result.reference_path[index_of_ref_node_to_use as usize],
            )
            .unwrap()
            .get_sequence();

        // extension sequence = first num_nodes_to_extend reference bases + old source's sequence
        let mut sequence_to_extend = Vec::new();
        for i in 0..num_nodes_to_extend {
            sequence_to_extend.push(ref_source_sequence[i])
        }
        sequence_to_extend.extend_from_slice(
            self.base_graph
                .graph
                .node_weight(dangling_source)
                .unwrap()
                .get_sequence(),
        );

        // clean up the source and edge: remember the old source's heaviest outgoing edge (its
        // target and multiplicity) before removing that edge from the graph
        let source_edge = self.get_heaviest_edge(dangling_source, Direction::Outgoing);
        let mut prev_v = self.base_graph.get_edge_target(source_edge);
        let removed_edge = self.base_graph.graph.remove_edge(source_edge).unwrap();

        // extend the path: walk backwards over the extension sequence, creating one kmer-sized
        // vertex per step and wiring each to the previously created vertex, reusing the removed
        // edge's multiplicity for every new edge
        // TODO: Make sure there are tests for this
        for i in (1..=num_nodes_to_extend).rev() {
            let new_v = MultiDeBruijnVertex::new_with_sequence(
                sequence_to_extend[i..i + self.base_graph.get_kmer_size()].to_vec(),
            );
            let new_v_index = self.base_graph.add_node(&new_v);
            let new_e =
                MultiSampleEdge::new(false, removed_edge.multiplicity, self.num_pruning_samples);
            let _new_e_index = self.base_graph.graph.add_edge(new_v_index, prev_v, new_e);
            dangling_head_merge_result.dangling_path.push(new_v_index);
            prev_v = new_v_index;
        }

        return true;
    }
1292
1293    /**
1294     * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
1295     * Requires that at a minimum there are at least #getMinMatchingBases() matches between the reference and the read
1296     * at the end in order to emit an alignment offset.
1297     *
1298     * @param cigarElements cigar elements corresponding to the alignment between path1 and path2
1299     * @param path1  the first path
1300     * @param path2  the second path
1301     * @return an integer pair object where the key is the offset into path1 and the value is offset into path2 (both -1 if no path is found)
1302     */
1303    fn best_prefix_match(
1304        &self,
1305        cigar_elements: &Vec<Cigar>,
1306        path1: &[u8],
1307        path2: &[u8],
1308    ) -> (i64, i64) {
1309        let min_matching_bases = self.get_min_matching_bases() as i64;
1310
1311        let mut ref_idx = cigar_elements
1312            .iter()
1313            .map(|cigar| {
1314                if CigarUtils::cigar_consumes_reference_bases(cigar) {
1315                    cigar.len() as i64
1316                } else {
1317                    0
1318                }
1319            })
1320            .sum::<i64>()
1321            - 1;
1322        let mut read_idx = path2.len() as i64 - 1;
1323
1324        // NOTE: this only works when the last cigar element has a sufficient number of M bases,
1325        // so no indels within min-mismatches of the edge.
1326        'cigarLoop: for ce in cigar_elements.iter().rev() {
1327            if !(CigarUtils::cigar_consumes_reference_bases(ce)
1328                && CigarUtils::cigar_consumes_read_bases(ce))
1329            {
1330                break;
1331            } else {
1332                for _j in 0..(ce.len()) {
1333                    if path1[ref_idx as usize] != path2[read_idx as usize] {
1334                        break 'cigarLoop;
1335                    };
1336                    ref_idx -= 1;
1337                    read_idx -= 1;
1338                    if ref_idx < 0 || read_idx < 0 {
1339                        break 'cigarLoop;
1340                    }
1341                }
1342            }
1343        }
1344
1345        let matches = path2.len() as i64 - 1 - read_idx;
1346        return if matches < min_matching_bases {
1347            (-1, -1)
1348        } else {
1349            (ref_idx, read_idx)
1350        };
1351    }
1352
1353    /**
1354     * The minimum number of matches to be considered allowable for recovering dangling ends
1355     */
1356    fn get_min_matching_bases(&self) -> i32 {
1357        self.min_matching_bases_to_dangling_end_recovery
1358    }
1359
1360    /**
1361     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
1362     * provided vertex is the sink) and the reference path.
1363     *
1364     * @param aligner
1365     * @param vertex      the sink of the dangling chain
1366     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
1367     * @param recoverAll  recover even branches with forks
1368     * @return a SmithWaterman object which can be null if no proper alignment could be generated
1369     */
1370    fn generate_cigar_against_downwards_reference_path(
1371        &self,
1372        vertex: NodeIndex,
1373        prune_factor: usize,
1374        min_dangling_branch_length: i32,
1375        recover_all: bool,
1376        dangling_tail_sw_parameters: &Parameters,
1377    ) -> Option<DanglingChainMergeHelper> {
1378        let min_tail_path_length = max(1, min_dangling_branch_length); // while heads can be 0, tails absolutely cannot
1379
1380        // find the lowest common ancestor path between this vertex and the diverging master path if available
1381        let alt_path =
1382            self.find_path_upwards_to_lowest_common_ancestor(vertex, prune_factor, !recover_all);
1383        match alt_path {
1384            None => return None,
1385            Some(ref alt_path) => {
1386                if self.base_graph.is_ref_source(alt_path[0])
1387                    || (alt_path.len() as i32) < min_tail_path_length + 1
1388                {
1389                    return None;
1390                }
1391            }
1392        };
1393
1394        let alt_path = alt_path.unwrap();
1395        // now get the reference path from the LCA
1396        let ref_path = self.get_reference_path(
1397            alt_path[0],
1398            TraversalDirection::Downwards,
1399            Some(self.get_heaviest_edge(alt_path[1], Direction::Incoming)),
1400        );
1401
1402        // create the Smith-Waterman strings to use
1403        let ref_bases = self.get_bases_for_path(&ref_path, false);
1404        let alt_bases = self.get_bases_for_path(&alt_path, false);
1405
1406        // run Smith-Waterman to determine the best alignment
1407        // (and remove trailing deletions since they aren't interesting)
1408        let alignment = SmithWatermanAligner::align(
1409            ref_bases.as_bytes(),
1410            alt_bases.as_bytes(),
1411            dangling_tail_sw_parameters,
1412            OverhangStrategy::LeadingInDel,
1413            self.avx_mode,
1414        );
1415
1416        return Some(DanglingChainMergeHelper::new(
1417            alt_path,
1418            ref_path,
1419            alt_bases,
1420            ref_bases,
1421            AlignmentUtils::remove_trailing_deletions(alignment.cigar),
1422        ));
1423    }
1424
1425    /**
1426     * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
1427     * provided vertex is the source) and the reference path.
1428     *
1429     * @param aligner
1430     * @param vertex      the source of the dangling head
1431     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
1432     * @param recoverAll  recover even branches with forks
1433     * @return a SmithWaterman object which can be null if no proper alignment could be generated
1434     */
1435    fn generate_cigar_against_upwards_reference_path(
1436        &self,
1437        vertex: NodeIndex,
1438        prune_factor: usize,
1439        min_dangling_branch_length: i32,
1440        recover_all: bool,
1441        dangling_head_sw_parameters: &Parameters,
1442    ) -> Option<DanglingChainMergeHelper> {
1443        // find the highest common descendant path between vertex and the reference source if available
1444        let alt_path = self.find_path_downwards_to_highest_common_desendant_of_reference(
1445            vertex,
1446            prune_factor,
1447            !recover_all,
1448        );
1449
1450        match alt_path {
1451            None => return None,
1452            Some(ref alt_path) => {
1453                if self.base_graph.is_ref_sink(alt_path[0])
1454                    || (alt_path.len() as i32) < min_dangling_branch_length + 1
1455                {
1456                    return None;
1457                }
1458            }
1459        }
1460        let alt_path = alt_path.unwrap();
1461
1462        // now get the reference path from the LCA
1463        let ref_path = self.get_reference_path(alt_path[0], TraversalDirection::Upwards, None);
1464
1465        // create the Smith-Waterman strings to use
1466        let ref_bases = self.get_bases_for_path(&ref_path, true);
1467        let alt_bases = self.get_bases_for_path(&alt_path, true);
1468
1469        // run Smith-Waterman to determine the best alignment
1470        // (and remove trailing deletions since they aren't interesting)
1471        let alignment = SmithWatermanAligner::align(
1472            ref_bases.as_bytes(),
1473            alt_bases.as_bytes(),
1474            dangling_head_sw_parameters,
1475            OverhangStrategy::LeadingInDel,
1476            self.avx_mode,
1477        );
1478
1479        return Some(DanglingChainMergeHelper::new(
1480            alt_path,
1481            ref_path,
1482            alt_bases,
1483            ref_bases,
1484            AlignmentUtils::remove_trailing_deletions(alignment.cigar),
1485        ));
1486    }
1487
1488    /**
1489     * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex.
1490     * However note that the path is reversed so that this vertex ends up at the end of the path.
1491     * Also note that nodes are excluded if their pruning weight is less than the pruning factor.
1492     *
1493     * @param vertex         the original vertex
1494     * @param pruneFactor    the prune factor to use in ignoring chain pieces
1495     * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
1496     * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or
1497     * has a descendant with multiple outgoing edges before hitting the reference path
1498     */
1499    fn find_path_downwards_to_highest_common_desendant_of_reference(
1500        &self,
1501        vertex: NodeIndex,
1502        prune_factor: usize,
1503        give_up_at_branch: bool,
1504    ) -> Option<Vec<NodeIndex>> {
1505        return if give_up_at_branch {
1506            let done = |v: NodeIndex| -> bool {
1507                self.base_graph.is_reference_node(v) || self.base_graph.out_degree_of(v) != 1
1508            };
1509
1510            let return_path = |v: NodeIndex| -> bool { self.base_graph.is_reference_node(v) };
1511
1512            let next_edge =
1513                |v: NodeIndex| -> EdgeIndex { self.base_graph.outgoing_edge_of(v).unwrap() };
1514
1515            let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_target(e) };
1516
1517            self.find_path(
1518                vertex,
1519                prune_factor,
1520                &done,
1521                &return_path,
1522                &next_edge,
1523                &next_node,
1524            )
1525        } else {
1526            let done = |v: NodeIndex| -> bool {
1527                self.base_graph.is_reference_node(v) || self.base_graph.out_degree_of(v) == 0
1528            };
1529
1530            let return_path = |v: NodeIndex| -> bool { self.base_graph.is_reference_node(v) };
1531
1532            let next_edge =
1533                |v: NodeIndex| -> EdgeIndex { self.get_heaviest_edge(v, Direction::Outgoing) };
1534
1535            let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_target(e) };
1536
1537            self.find_path(
1538                vertex,
1539                prune_factor,
1540                &done,
1541                &return_path,
1542                &next_edge,
1543                &next_node,
1544            )
1545        };
1546    }
1547
1548    /**
1549     * Finds the path in the graph from this vertex to the reference sink, including this vertex
1550     *
1551     * @param start           the reference vertex to start from
1552     * @param direction       describes which direction to move in the graph (i.e. down to the reference sink or up to the source)
1553     * @param blacklistedEdge edge to ignore in the traversal down; useful to exclude the non-reference dangling paths
1554     * @return the path (non-null, non-empty)
1555     */
1556    fn get_reference_path(
1557        &self,
1558        start: NodeIndex,
1559        direction: TraversalDirection,
1560        blacklisted_edge: Option<EdgeIndex>,
1561    ) -> Vec<NodeIndex> {
1562        let mut path = Vec::new();
1563
1564        let mut v = Some(start);
1565        while v.is_some() {
1566            path.push(v.unwrap());
1567            v = match direction {
1568                TraversalDirection::Downwards => {
1569                    self.base_graph
1570                        .get_next_reference_vertex(v, true, blacklisted_edge)
1571                }
1572                TraversalDirection::Upwards => self.base_graph.get_prev_reference_vertex(v),
1573            };
1574        }
1575
1576        return path;
1577    }
1578
1579    /**
1580     * Finds the path upwards in the graph from this vertex to the first diverging node, including that (lowest common ancestor) vertex.
1581     * Note that nodes are excluded if their pruning weight is less than the pruning factor.
1582     *
1583     * @param vertex         the original vertex
1584     * @param pruneFactor    the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
1585     * @param giveUpAtBranch stop trying to find a path if a vertex with multiple incoming or outgoing edge is found
1586     * @return the path if it can be determined or null if this vertex either doesn't merge onto another path or
1587     * has an ancestor with multiple incoming edges before hitting the reference path
1588     */
1589    fn find_path_upwards_to_lowest_common_ancestor(
1590        &self,
1591        vertex: NodeIndex,
1592        prune_factor: usize,
1593        give_up_at_branch: bool,
1594    ) -> Option<Vec<NodeIndex>> {
1595        return if give_up_at_branch {
1596            let done = |v: NodeIndex| -> bool {
1597                self.base_graph.in_degree_of(v) != 1 || self.base_graph.out_degree_of(v) >= 2
1598            };
1599
1600            let return_path = |v: NodeIndex| -> bool { self.base_graph.out_degree_of(v) > 1 };
1601
1602            let next_edge =
1603                |v: NodeIndex| -> EdgeIndex { self.base_graph.incoming_edge_of(v).unwrap() };
1604
1605            let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_source(e) };
1606
1607            self.find_path(
1608                vertex,
1609                prune_factor,
1610                &done,
1611                &return_path,
1612                &next_edge,
1613                &next_node,
1614            )
1615        } else {
1616            let done = |v: NodeIndex| -> bool {
1617                self.has_incident_ref_edge(v) || self.base_graph.in_degree_of(v) == 0
1618            };
1619
1620            let return_path = |v: NodeIndex| -> bool {
1621                self.base_graph.out_degree_of(v) > 1 && self.has_incident_ref_edge(v)
1622            };
1623
1624            let next_edge =
1625                |v: NodeIndex| -> EdgeIndex { self.get_heaviest_edge(v, Direction::Incoming) };
1626
1627            let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_source(e) };
1628
1629            self.find_path(
1630                vertex,
1631                prune_factor,
1632                &done,
1633                &return_path,
1634                &next_edge,
1635                &next_node,
1636            )
1637        };
1638    }
1639
1640    /**
1641     * Finds a path starting from a given vertex and satisfying various predicates
1642     *
1643     * @param vertex      the original vertex
1644     * @param pruneFactor the prune factor to use in ignoring chain pieces if edge multiplicity is < pruneFactor
1645     * @param done        test for whether a vertex is at the end of the path
1646     * @param returnPath  test for whether to return a found path based on its terminal vertex
1647     * @param nextEdge    function on vertices returning the next edge in the path
1648     * @param nextNode    function of edges returning the next vertex in the path
1649     * @return a path, if one satisfying all predicates is found, {@code null} otherwise
1650     */
1651    fn find_path(
1652        &self,
1653        vertex: NodeIndex,
1654        prune_factor: usize,
1655        done: &dyn Fn(NodeIndex) -> bool,
1656        return_path: &dyn Fn(NodeIndex) -> bool,
1657        next_edge: &dyn Fn(NodeIndex) -> EdgeIndex,
1658        next_node: &dyn Fn(EdgeIndex) -> NodeIndex,
1659    ) -> Option<Vec<NodeIndex>> {
1660        let mut path = Vec::new();
1661        let mut visited_nodes = HashSet::new();
1662
1663        let mut v = vertex;
1664        while !done(v) {
1665            let edge = next_edge(v);
1666            // if it has too low a weight, don't use it (or previous vertexes) for the path
1667            if self
1668                .base_graph
1669                .graph
1670                .edge_weight(edge)
1671                .unwrap()
1672                .get_pruning_multiplicity()
1673                < prune_factor
1674            {
1675                // save the previously visited notes to protect us from riding forever 'neath the streets of boston
1676                visited_nodes.par_extend(path);
1677                path = Vec::new();
1678            } else {
1679                path.insert(0, v);
1680            }
1681
1682            v = next_node(edge);
1683            // Check that we aren't stuck in a loop
1684            if path.contains(&v) || visited_nodes.contains(&v) {
1685                error!("Dangling end recovery killed because of a loop (find_path)");
1686                return None;
1687            }
1688        }
1689        path.insert(0, v);
1690
1691        return if return_path(v) { Some(path) } else { None };
1692    }
1693
1694    fn has_incident_ref_edge(&self, v: NodeIndex) -> bool {
1695        for edge in self.base_graph.graph.edges_directed(v, Direction::Incoming) {
1696            if self.base_graph.edge_is_ref(edge.id()) {
1697                return true;
1698            }
1699        }
1700        return false;
1701    }
1702
1703    fn get_heaviest_edge(&self, v: NodeIndex, direction: Direction) -> EdgeIndex {
1704        let edges = self
1705            .base_graph
1706            .graph
1707            .edges_directed(v, direction)
1708            .map(|e| e.id())
1709            .collect::<Vec<EdgeIndex>>();
1710        return if edges.len() == 1 {
1711            edges[0]
1712        } else {
1713            let comparator = Extract::new(|edge: &EdgeIndex| {
1714                self.base_graph
1715                    .graph
1716                    .edge_weight(*edge)
1717                    .unwrap()
1718                    .multiplicity
1719            });
1720
1721            *edges
1722                .iter()
1723                .max_by(|l, r| comparator.compare(l, r))
1724                .unwrap()
1725        };
1726    }
1727
1728    /**
1729     * The base sequence for the given path.
1730     *
1731     * @param path         the list of vertexes that make up the path
1732     * @param expandSource if true and if we encounter a source node, then expand (and reverse) the character sequence for that node
1733     * @return non-null sequence of bases corresponding to the given path
1734     */
1735    fn get_bases_for_path(&self, path: &Vec<NodeIndex>, expand_source: bool) -> String {
1736        assert!(!path.is_empty(), "Path cannot be empty");
1737
1738        let sb = path
1739            .par_iter()
1740            .map(|v| {
1741                if expand_source && self.base_graph.is_source(*v) {
1742                    let mut seq = self
1743                        .base_graph
1744                        .graph
1745                        .node_weight(*v)
1746                        .unwrap()
1747                        .get_sequence()
1748                        .to_vec();
1749                    seq.reverse();
1750                    std::str::from_utf8(&seq).unwrap().to_string()
1751                } else {
1752                    std::str::from_utf8(
1753                        self.base_graph
1754                            .graph
1755                            .node_weight(*v)
1756                            .unwrap()
1757                            .get_suffix_as_array(),
1758                    )
1759                    .unwrap()
1760                    .to_string()
1761                }
1762            })
1763            .collect::<String>();
1764
1765        return sb;
1766    }
1767
    /// Removes paths that are not connected to the reference; delegates to the
    /// underlying `BaseGraph` implementation.
    fn remove_paths_not_connected_to_ref(&mut self) {
        self.base_graph.remove_paths_not_connected_to_ref()
    }
1771
    /// Converts this read-threading graph into a sequence graph; delegates to the
    /// underlying `BaseGraph` implementation.
    fn to_sequence_graph(&self) -> SeqGraph<BaseEdgeStruct> {
        self.base_graph.to_sequence_graph()
    }
1775
    /// Returns the kmer size this graph was built with (from the underlying `BaseGraph`).
    fn get_kmer_size(&self) -> usize {
        self.base_graph.get_kmer_size()
    }
1779
    /// Returns the reference source vertex, if one exists; delegates to the
    /// underlying `BaseGraph`.
    fn get_reference_source_vertex(&self) -> Option<NodeIndex> {
        self.base_graph.get_reference_source_vertex()
    }
1783
    /// Returns the reference sink vertex, if one exists; delegates to the
    /// underlying `BaseGraph`.
    fn get_reference_sink_vertex(&self) -> Option<NodeIndex> {
        self.base_graph.get_reference_sink_vertex()
    }
1787
    // Hook called immediately before haplotype finding, in the event there are
    // alterations that must be made to the graph based on the implementation.
    fn post_process_for_haplotype_finding<L: Locatable>(
        &mut self,
        _debug_graph_output_path: Option<&String>,
        _ref_haplotype: &L,
    ) {
        // Intentionally a no-op: this graph needs no pre-haplotype-finding processing.
    }
1797}