1use compare::{Compare, Extract};
2use gkl::smithwaterman::{OverhangStrategy, Parameters};
3use hashlink::LinkedHashMap;
4use petgraph::stable_graph::{EdgeIndex, NodeIndex};
5use petgraph::visit::EdgeRef;
6use petgraph::Direction;
7use rayon::prelude::*;
8use rust_htslib::bam::record::Cigar;
9use std::cmp::{max, min};
10use std::collections::HashSet;
11
12use crate::assembly::kmer::Kmer;
13use crate::graphs::base_edge::BaseEdge;
14use crate::graphs::base_edge::BaseEdgeStruct;
15use crate::graphs::base_graph::BaseGraph;
16use crate::graphs::base_vertex::BaseVertex;
17use crate::graphs::multi_sample_edge::MultiSampleEdge;
18use crate::graphs::seq_graph::SeqGraph;
19use crate::pair_hmm::pair_hmm_likelihood_calculation_engine::AVXMode;
20use crate::read_threading::abstract_read_threading_graph::{
21 AbstractReadThreadingGraph, DanglingChainMergeHelper, SequenceForKmers, TraversalDirection,
22};
23use crate::read_threading::multi_debruijn_vertex::MultiDeBruijnVertex;
24use crate::reads::alignment_utils::AlignmentUtils;
25use crate::reads::bird_tool_reads::BirdToolRead;
26use crate::reads::cigar_utils::CigarUtils;
27use crate::smith_waterman::smith_waterman_aligner::SmithWatermanAligner;
28use crate::utils::simple_interval::Locatable;
29
#[derive(Debug, Clone)]
/// A de Bruijn graph assembled by threading reads (and the reference) kmer by
/// kmer. Wraps a `BaseGraph` of `MultiDeBruijnVertex` nodes joined by
/// `MultiSampleEdge`s carrying per-sample multiplicities.
pub struct ReadThreadingGraph {
    // Kmers observed more than once across the pending sequences; these are
    // never used as threading starts or tracked in the kmer->vertex map.
    non_unique_kmers: HashSet<Kmer>,
    // Minimum matched bases required to recover a dangling end; negative
    // selects the legacy recovery behaviour (see merge_dangling_* methods).
    min_matching_bases_to_dangling_end_recovery: i32,
    // Counts threaded sequences; only used to name debug .dot dumps.
    counter: usize,
    // Maps each unique kmer to the graph vertex that represents it.
    kmer_to_vertex_map: LinkedHashMap<Kmer, NodeIndex>,
    // When true, vertices record the names of reads threaded through them.
    debug_graph_transformations: bool,
    // Bases below this quality break a read into separate sub-sequences.
    min_base_quality_to_use_in_assembly: u8,
    // Vertices of the reference haplotype, in threading order.
    pub reference_path: Vec<NodeIndex>,
    // Set once build_graph_if_necessary has run; guards re-building and
    // is a precondition for dangling-end recovery.
    already_built: bool,
    // The first kmer of the reference sequence, if one has been threaded.
    ref_source: Option<Kmer>,
    // When true, threading may only start at kmers already in the vertex map.
    start_threading_only_at_existing_vertex: bool,
    // Cap on mismatches allowed in legacy dangling-head matching; <= 0 means
    // derive the cap from the branch length (see get_max_mismatches_legacy).
    max_mismatches_in_dangling_head: i32,
    // When true, backward count propagation follows all branches, not just
    // vertices with in-degree 1.
    increase_counts_through_branches: bool,
    // Number of samples required to keep an edge during pruning.
    num_pruning_samples: usize,
    pub(crate) base_graph: BaseGraph<MultiDeBruijnVertex, MultiSampleEdge>,
    // Whether Smith-Waterman alignment uses the AVX-accelerated path.
    avx_mode: AVXMode,
}
64
impl ReadThreadingGraph {
    /// Sample name used for reads that carry no sample association.
    pub const ANONYMOUS_SAMPLE: &'static str = "XXX_UNNAMED_XXX";
    /// When true, a .dot snapshot is written after every threaded sequence.
    const WRITE_GRAPH: bool = false;
    /// Maximum number of cigar elements tolerated when merging dangling ends.
    pub const MAX_CIGAR_COMPLEXITY: usize = 3;
    /// Whether threading also propagates counts backwards from the start kmer.
    const INCREASE_COUNTS_BACKWARDS: bool = true;

    /// Builds an empty graph for the given kmer size.
    ///
    /// NOTE(review): the `_debug_graph_transformations` parameter is ignored
    /// and the `debug_graph_transformations` field is hard-coded to `true`
    /// below — confirm whether that is intentional before relying on it.
    pub fn new(
        kmer_size: usize,
        _debug_graph_transformations: bool,
        min_base_quality_to_use_in_assembly: u8,
        min_pruning_samples: usize,
        min_matching_bases_to_dangling_end_recovery: i32,
        avx_mode: AVXMode,
    ) -> Self {
        let base_graph = BaseGraph::new(kmer_size);
        Self {
            non_unique_kmers: HashSet::new(),
            min_matching_bases_to_dangling_end_recovery,
            counter: 0,
            kmer_to_vertex_map: LinkedHashMap::new(),
            // hard-coded; see NOTE(review) above
            debug_graph_transformations: true,
            min_base_quality_to_use_in_assembly,
            reference_path: Vec::new(),
            already_built: false,
            ref_source: None,
            start_threading_only_at_existing_vertex: false,
            max_mismatches_in_dangling_head: -1,
            increase_counts_through_branches: false,
            num_pruning_samples: min_pruning_samples,
            base_graph,
            avx_mode,
        }
    }

    /// Convenience constructor with conventional defaults (min base quality 6,
    /// one pruning sample, legacy dangling-end recovery, auto-detected AVX).
    pub fn default_with_kmer_size(kmer_size: usize) -> Self {
        Self::new(kmer_size, false, 6, 1, -1, AVXMode::detect_mode())
    }

    /// Returns every kmer of `seq_for_kmers` that occurs more than once in it.
    /// A kmer repeated n times appears n-1 times in the result (duplicates are
    /// collapsed later when the caller collects into a set).
    pub fn determine_non_unique_kmers<'a>(
        seq_for_kmers: &SequenceForKmers<'a>,
        kmer_size: usize,
    ) -> Vec<Kmer> {
        let mut all_kmers = HashSet::new();
        let mut non_unique_kmers = Vec::new();

        // Last valid kmer start; None means the sequence is shorter than one kmer.
        let stop_position = seq_for_kmers.stop.checked_sub(kmer_size);
        match stop_position {
            None => {
            }
            Some(stop_position) => {
                for i in 0..=stop_position {
                    let kmer =
                        Kmer::new_with_start_and_length(seq_for_kmers.sequence, i, kmer_size);
                    if all_kmers.contains(&kmer) {
                        non_unique_kmers.push(kmer);
                    } else {
                        all_kmers.insert(kmer);
                    };
                }
            }
        }

        return non_unique_kmers;
    }

    /// Flattens the per-sample pending map into one list of sequence refs.
    fn get_all_pending_sequences<'a, 'b>(
        pending: &'b LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
    ) -> Vec<&'b SequenceForKmers<'a>> {
        return pending
            .values()
            .flat_map(|one_sample_worth| one_sample_worth)
            .collect::<Vec<&SequenceForKmers>>();
    }

    /// Computes the union of non-unique kmers across all given sequences.
    fn determine_non_uniques(
        &self,
        kmer_size: usize,
        with_non_uniques: Vec<&SequenceForKmers>,
    ) -> HashSet<Kmer> {
        with_non_uniques
            .iter()
            .filter_map(|sequence_for_kmers| {
                let non_uniques_from_seq =
                    Self::determine_non_unique_kmers(
                        sequence_for_kmers,
                        kmer_size
                    );
                if !non_uniques_from_seq.is_empty() {
                    Some(non_uniques_from_seq)
                } else {
                    None
                }
            })
            .flat_map(|non_uniques| non_uniques.into_iter())
            .collect::<HashSet<Kmer>>()
    }

    /// Read access to the current set of non-unique kmers.
    /// (Takes `&mut self` although it does not mutate — kept for API stability.)
    pub fn get_non_uniques(&mut self) -> &HashSet<Kmer> {
        &self.non_unique_kmers
    }
}
210
211impl AbstractReadThreadingGraph for ReadThreadingGraph {
212 fn get_base_graph(&self) -> &BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
213 &self.base_graph
214 }
215
216 fn get_base_graph_mut(&mut self) -> &mut BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
217 &mut self.base_graph
218 }
219
220 fn find_kmer(&self, kmer: &Kmer) -> Option<&NodeIndex> {
221 self.kmer_to_vertex_map.get(kmer)
222 }
223
224 fn set_increase_counts_through_branches(&mut self, increase_counts_through_branches: bool) {
225 self.increase_counts_through_branches = increase_counts_through_branches;
226 }
227
228 fn set_min_matching_bases_to_dangling_end_recovery(&mut self, min_matching_bases: i32) {
229 self.min_matching_bases_to_dangling_end_recovery = min_matching_bases;
230 }
231
232 fn is_threading_start(
240 &self,
241 kmer: &Kmer,
242 ) -> bool {
243 if self.start_threading_only_at_existing_vertex {
244 self.kmer_to_vertex_map.contains_key(kmer)
245 } else {
246 !self.non_unique_kmers.contains(kmer)
247 }
248 }
249
250 fn has_cycles(&self) -> bool {
251 self.base_graph.has_cycles()
252 }
253
254 fn is_low_quality_graph(&self) -> bool {
262 return self.non_unique_kmers.len() * 4 > self.kmer_to_vertex_map.len();
263 }
264
265 fn get_next_kmer_vertex_for_chain_extension(
267 &self,
268 kmer: &Kmer,
269 is_ref: bool,
270 _prev_vertex: NodeIndex,
271 ) -> Option<&NodeIndex> {
272 let unique_merge_vertex = self.get_kmer_vertex(kmer, false);
273 assert!(
274 !(is_ref && unique_merge_vertex.is_some()),
275 "Did not find a unique vertex to merge into the reference path: {} {}", is_ref, unique_merge_vertex.is_some()
276 );
277
278 return unique_merge_vertex;
279 }
280
281 fn preprocess_reads<'a>(&mut self, pending: &LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>) {
285 self.non_unique_kmers = self.determine_non_uniques(
290 self.base_graph.get_kmer_size(),
291 Self::get_all_pending_sequences(pending),
292 );
293 }
299
300 fn should_remove_reads_after_graph_construction(&self) -> bool {
302 return true;
303 }
304
305 fn add_sequence<'a>(
316 &self,
317 pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
318 seq_name: String,
319 sample_index: usize,
320 sequence: &'a [u8],
321 start: usize,
322 stop: usize,
323 count: usize,
324 is_ref: bool,
325 ) {
326 let sample_sequences = pending.entry(sample_index).or_insert(Vec::new());
329 sample_sequences.push(SequenceForKmers::new(
331 seq_name, sequence, start, stop, count, is_ref,
332 ))
333 }
334
    /// Splits `read` into maximal runs of assembly-usable bases (quality above
    /// threshold and not 'N') and queues each run that is long enough to yield
    /// at least one kmer. `count` is incremented once per queued run.
    fn add_read<'a>(
        &mut self,
        read: &'a BirdToolRead,
        _sample_names: &[String],
        count: &mut usize,
        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
    ) {
        let sequence = read.seq();
        let qualities = read.read.qual();
        if sequence.is_empty() || qualities.is_empty() {
            return;
        }

        // Start index of the current run of usable bases; -1 = not in a run.
        let mut last_good = -1;
        // Iterate one past the end so a run reaching the read's end is closed.
        for end in 0..=sequence.len() {
            // Short-circuit keeps the indexing below in bounds at end == len.
            if end as usize == sequence.len()
                || !self.base_is_usable_for_assembly(sequence[end], qualities[end])
            {
                // The current run (if any) ends at `end` (exclusive).
                let start = last_good;
                let len = end as i32 - start;

                // Keep only runs long enough to contain at least one kmer.
                if start != -1 && len >= self.base_graph.get_kmer_size() as i32 {
                    let name = format!(
                        "{}_{}_{}",
                        std::str::from_utf8(read.read.qname()).unwrap(),
                        start,
                        end
                    );
                    self.add_sequence(
                        pending,
                        name,
                        read.sample_index,
                        sequence,
                        start as usize,
                        end,
                        1,
                        false,
                    );
                    *count += 1;
                }
                last_good = -1;
            } else if last_good == -1 {
                // First usable base after a break: a new run begins here.
                last_good = end as i32;
            }
        }
    }
391
392 fn base_is_usable_for_assembly(&self, base: u8, qual: u8) -> bool {
401 return base.to_ascii_uppercase() != b'N'
402 && qual >= self.min_base_quality_to_use_in_assembly;
403 }
404
405 fn set_threading_start_only_at_existing_vertex(&mut self, value: bool) {
406 self.start_threading_only_at_existing_vertex = value
407 }
408
409 fn print_graph(&self, file_name: String, _prune_factor: usize) {
410 self.base_graph.print_graph(&file_name, true, 0);
411 }
412
    /// Threads every pending sequence into the graph exactly once.
    ///
    /// Order matters: non-unique kmers are computed first, then each sample's
    /// sequences are threaded and its per-sample edge multiplicities flushed
    /// into totals before the next sample starts. Finally every tracked kmer
    /// vertex is tagged with a "+" in its additional info.
    fn build_graph_if_necessary<'a>(
        &mut self,
        pending: &mut LinkedHashMap<usize, Vec<SequenceForKmers<'a>>>,
    ) {
        if self.already_built {
            return;
        }

        // Must run before threading: threading starts avoid non-unique kmers.
        self.preprocess_reads(&pending);

        for (_name, sequences_for_samples) in pending.iter() {
            for sequence_for_kmers in sequences_for_samples.iter() {
                self.thread_sequence(sequence_for_kmers);
                if Self::WRITE_GRAPH {
                    self.base_graph.print_graph(
                        &format!(
                            "threading.{}.{}.dot",
                            self.counter,
                            sequence_for_kmers.name.replace(" ", "_")
                        ),
                        true,
                        0,
                    );
                }
                self.counter += 1;
            }
            // Fold this sample's edge counts into the cross-sample totals
            // before the next sample is threaded.
            for e in self.base_graph.graph.edge_weights_mut() {
                e.flush_single_sample_multiplicity()
            }
        }

        self.already_built = true;
        // Mark every vertex that is tracked in the kmer map for debugging.
        for (_, v) in self.kmer_to_vertex_map.iter_mut() {
            let node = self.base_graph.graph.node_weight_mut(*v).unwrap();
            node.set_additional_info(format!("{}+", node.get_additional_info()));
        }
    }
478
479 fn thread_sequence(&mut self, seq_for_kmers: &SequenceForKmers) {
485 let start_pos_option = self.find_start(seq_for_kmers);
486
487 match start_pos_option {
488 None => {
489 }
491 Some(start_pos) => {
492 if seq_for_kmers.sequence.len() <= start_pos + self.base_graph.get_kmer_size() {
493 return;
495 }
496 let starting_vertex =
497 self.get_or_create_kmer_vertex(&seq_for_kmers.sequence, start_pos);
498 if Self::INCREASE_COUNTS_BACKWARDS {
500 self.increase_counts_in_matched_kmers(
502 seq_for_kmers,
503 starting_vertex,
504 &seq_for_kmers.sequence,
505 self.base_graph.get_kmer_size().checked_sub(2),
506 );
507 }
508
509 if self.debug_graph_transformations {
510 self.base_graph
511 .graph
512 .node_weight_mut(starting_vertex)
513 .unwrap()
514 .add_read(seq_for_kmers.name.clone());
515 }
516
517 if seq_for_kmers.is_ref {
519 if self.ref_source.is_some() {
520 panic!(
521 "Found two ref_sources! prev: {:?}, new: {:?}",
522 self.ref_source, seq_for_kmers
523 )
524 }
525 self.reference_path = Vec::with_capacity(
526 seq_for_kmers.sequence.len() - self.base_graph.get_kmer_size(),
527 );
528 self.reference_path.push(starting_vertex);
529 self.ref_source = Some(Kmer::new_with_start_and_length(
530 seq_for_kmers.sequence,
531 seq_for_kmers.start,
532 self.base_graph.get_kmer_size(),
533 ));
534 };
535
536 let mut vertex = starting_vertex;
537
538 for i in start_pos + 1..=(seq_for_kmers.stop - self.base_graph.get_kmer_size()) {
539 vertex = self.extend_chain_by_one(
540 vertex,
541 &seq_for_kmers.sequence,
542 i,
543 seq_for_kmers.count,
544 seq_for_kmers.is_ref,
545 );
546 if seq_for_kmers.is_ref {
547 self.reference_path.push(vertex);
548 }
549 if self.debug_graph_transformations {
550 self.base_graph
551 .graph
552 .node_weight_mut(starting_vertex)
553 .unwrap()
554 .add_read(seq_for_kmers.name.clone());
555 }
556 }
557 }
560 }
561 }
562
563 fn find_start(&self, seq_for_kmers: &SequenceForKmers) -> Option<usize> {
570 if seq_for_kmers.is_ref {
571 return Some(0);
572 } else {
573 let stop = seq_for_kmers.stop.saturating_sub(self.base_graph.get_kmer_size());
574
575 for i in seq_for_kmers.start..stop {
576 let kmer1 = Kmer::new_with_start_and_length(
577 seq_for_kmers.sequence,
578 i,
579 self.base_graph.get_kmer_size(),
580 );
581 if self.is_threading_start(&kmer1) {
582 return Some(i);
583 }
584 }
585
586 return None;
587 }
588 }
589
590 fn get_or_create_kmer_vertex(&mut self, sequence: &[u8], start: usize) -> NodeIndex {
598 let kmer =
599 Kmer::new_with_start_and_length(sequence, start, self.base_graph.get_kmer_size());
600 let vertex = self.get_kmer_vertex(&kmer, true);
601 match vertex {
602 None => self.create_vertex(sequence, kmer),
603 Some(vertex) => *vertex,
604 }
605 }
606
607 fn get_kmer_vertex(&self, kmer: &Kmer, allow_ref_source: bool) -> Option<&NodeIndex> {
614 if !allow_ref_source && self.ref_source.as_ref() == Some(kmer) {
615 return None;
616 }
617 return self.kmer_to_vertex_map.get(kmer);
618 }
619
620 fn create_vertex(&mut self, sequence: &[u8], kmer: Kmer) -> NodeIndex {
627 let new_vertex = MultiDeBruijnVertex::new(kmer.bases(sequence).to_vec(), false);
628 let node_index = self.base_graph.add_node(&new_vertex);
629 self.track_kmer(kmer, node_index);
630
631 return node_index;
632 }
633
634 fn track_kmer(&mut self, kmer: Kmer, new_vertex: NodeIndex) {
636 if !self.non_unique_kmers.contains(&kmer) && !self.kmer_to_vertex_map.contains_key(&kmer) {
637 self.kmer_to_vertex_map.insert(kmer, new_vertex);
638 };
639 }
640
641 fn increase_counts_in_matched_kmers(
642 &mut self,
643 seqs_for_kmers: &SequenceForKmers,
644 vertex: NodeIndex,
645 original_kmer: &[u8],
646 offset: Option<usize>,
647 ) {
648 match offset {
649 None => {} Some(offset) => {
651 let outgoing_edges = self
652 .base_graph
653 .graph
654 .edges_directed(vertex, Direction::Incoming)
655 .map(|e| e.id())
656 .collect::<Vec<EdgeIndex>>();
657
658 for edge in outgoing_edges {
659 let prev = self.base_graph.get_edge_source(edge);
660 let suffix = self
661 .base_graph
662 .graph
663 .node_weight(prev)
664 .unwrap()
665 .get_suffix();
666 let seq_base = original_kmer[offset];
667
668 if suffix == seq_base
669 && (self.increase_counts_through_branches
670 || self.base_graph.in_degree_of(vertex) == 1)
671 {
672 self.base_graph
673 .graph
674 .edge_weight_mut(edge)
675 .unwrap()
676 .inc_multiplicity(seqs_for_kmers.count);
677 self.increase_counts_in_matched_kmers(
678 seqs_for_kmers,
679 prev,
680 original_kmer,
681 offset.checked_sub(1),
682 );
683 }
684 }
685 }
686 }
687 }
688
    /// Extends the chain from `prev_vertex` by a single base.
    ///
    /// First tries to follow an existing outgoing edge whose target vertex's
    /// suffix equals the next sequence base (bumping its multiplicity).
    /// Otherwise, either merges into a unique existing kmer vertex or creates
    /// a fresh vertex, connecting it with a new edge. Returns the vertex now
    /// at the end of the chain.
    fn extend_chain_by_one(
        &mut self,
        prev_vertex: NodeIndex,
        sequence: &[u8],
        kmer_start: usize,
        count: usize,
        is_ref: bool,
    ) -> NodeIndex {
        let outgoing_edges = self
            .base_graph
            .graph
            .edges_directed(prev_vertex, Direction::Outgoing);
        // Index of the single new base this extension must match.
        let next_pos = kmer_start + self.base_graph.get_kmer_size() - 1;
        let mut found_match = None;
        let mut return_target = None;
        for outgoing_edge in outgoing_edges {
            let target = self
                .base_graph
                .graph
                .edge_endpoints(outgoing_edge.id())
                .unwrap()
                .1;
            if self
                .base_graph
                .graph
                .node_weight(target)
                .unwrap()
                .get_suffix()
                == sequence[next_pos]
            {
                // First matching edge wins; remember both edge and target.
                found_match = Some(outgoing_edge.id());
                return_target = Some(target);
                break;
            }
        }

        if found_match.is_some() {
            // Existing path: just increase the traversed edge's multiplicity.
            self.base_graph
                .graph
                .edge_weight_mut(found_match.unwrap())
                .unwrap()
                .inc_multiplicity(count);
            return return_target.unwrap();
        }

        // No existing edge matched: merge into a unique kmer vertex if one
        // exists, else mint a new vertex for this kmer.
        let kmer =
            Kmer::new_with_start_and_length(sequence, kmer_start, self.base_graph.get_kmer_size());
        let merge_vertex =
            self.get_next_kmer_vertex_for_chain_extension(&kmer, is_ref, prev_vertex);

        let next_vertex = if merge_vertex.is_none() {
            self.create_vertex(sequence, kmer)
        } else {
            *merge_vertex.unwrap()
        };

        let _edge_index = self.base_graph.graph.add_edge(
            prev_vertex,
            next_vertex,
            MultiSampleEdge::new(is_ref, count, self.num_pruning_samples),
        );
        return next_vertex;
    }
770
771 fn recover_dangling_tails(
780 &mut self,
781 prune_factor: usize,
782 min_dangling_branch_length: i32,
783 recover_all: bool,
784 dangling_end_sw_parameters: &Parameters,
785 ) {
786 if !self.already_built {
787 panic!("recover_dangling_tails requires that graph already be built")
788 }
789
790 let dangling_tails = self
793 .base_graph
794 .graph
795 .node_indices()
796 .filter(|v| self.base_graph.out_degree_of(*v) == 0 && !self.base_graph.is_ref_sink(*v))
797 .collect::<Vec<NodeIndex>>();
798
799 for v in dangling_tails {
802 self.recover_dangling_tail(
805 v,
806 prune_factor,
807 min_dangling_branch_length,
808 recover_all,
809 dangling_end_sw_parameters,
810 );
811 }
812
813 }
815
816 fn recover_dangling_heads(
825 &mut self,
826 prune_factor: usize,
827 min_dangling_branch_length: i32,
828 recover_all: bool,
829 dangling_head_sw_parameters: &Parameters,
830 ) {
831 if !self.already_built {
832 panic!("recover_dangling_tails requires that graph already be built")
833 }
834
835 let dangling_heads = self
838 .base_graph
839 .graph
840 .node_indices()
841 .filter(|v| self.base_graph.in_degree_of(*v) == 0 && !self.base_graph.is_ref_source(*v))
842 .collect::<Vec<NodeIndex>>();
843
844 for v in dangling_heads {
848 self.recover_dangling_head(
851 v,
852 prune_factor,
853 min_dangling_branch_length,
854 recover_all,
855 dangling_head_sw_parameters,
856 );
857 }
858
859 }
861
862 fn recover_dangling_tail(
872 &mut self,
873 vertex: NodeIndex,
874 prune_factor: usize,
875 min_dangling_branch_length: i32,
876 recover_all: bool,
877 dangling_tail_sw_parameters: &Parameters,
878 ) -> usize {
879 if self.base_graph.out_degree_of(vertex) != 0 {
880 panic!(
881 "Attempting to recover a dangling tail for {:?} but it has out-degree > 0",
882 vertex
883 )
884 };
885
886 let dangling_tail_merge_result = self.generate_cigar_against_downwards_reference_path(
888 vertex,
889 prune_factor,
890 min_dangling_branch_length,
891 recover_all,
892 dangling_tail_sw_parameters,
893 );
894
895 match dangling_tail_merge_result {
896 None => return 0,
897 Some(dangling_tail_merge_result) => {
898 if !Self::cigar_is_okay_to_merge(&dangling_tail_merge_result.cigar, false, true) {
899 return 0;
900 } else {
901 self.merge_dangling_tail(dangling_tail_merge_result)
902 }
903 }
904 }
905 }
906
907 fn recover_dangling_head(
918 &mut self,
919 vertex: NodeIndex,
920 prune_factor: usize,
921 min_dangling_branch_length: i32,
922 recover_all: bool,
923 dangling_head_sw_parameters: &Parameters,
924 ) -> usize {
925 if self.base_graph.in_degree_of(vertex) != 0 {
926 panic!(
927 "Attempting to recover a dangling head for {:?} but it has in-degree > 0",
928 vertex
929 )
930 };
931
932 let dangling_head_merge_result = self.generate_cigar_against_upwards_reference_path(
934 vertex,
935 prune_factor,
936 min_dangling_branch_length,
937 recover_all,
938 dangling_head_sw_parameters,
939 );
940
941 match dangling_head_merge_result {
942 None => return 0,
943 Some(dangling_head_merge_result) => {
944 if !Self::cigar_is_okay_to_merge(&dangling_head_merge_result.cigar, true, false) {
945 return 0;
946 } else if self.min_matching_bases_to_dangling_end_recovery >= 0 {
947 self.merge_dangling_head(dangling_head_merge_result)
948 } else {
949 self.merge_dangling_head_legacy(dangling_head_merge_result)
950 }
951 }
952 }
953 }
954
    /// Merges a dangling tail onto the reference path based on the SW cigar;
    /// returns 1 on success, 0 when the merge is rejected.
    ///
    /// # Panics
    /// Panics when the cigar does not end in a Match element.
    fn merge_dangling_tail(
        &mut self,
        dangling_tail_merge_result: DanglingChainMergeHelper,
    ) -> usize {
        let elements = &dangling_tail_merge_result.cigar.0;
        let last_element = elements[elements.len() - 1];

        // The merge point must sit inside a trailing match block.
        match last_element {
            Cigar::Match(_) => {
            }
            _ => panic!("Last cigar element must be Match(_)"),
        };

        let last_ref_index =
            CigarUtils::get_reference_length(&dangling_tail_merge_result.cigar) - 1;
        // Longest common suffix of ref/alt strings, capped by the final match
        // element so the merge stays within that block.
        let matching_suffix = min(
            Self::longest_suffix_match(
                dangling_tail_merge_result.reference_path_string.as_bytes(),
                dangling_tail_merge_result.dangling_path_string.as_bytes(),
                last_ref_index as i64,
            ),
            last_element.len() as usize,
        );

        // Reject merges with too few matching bases (threshold mode) or none
        // at all (legacy mode).
        if self.min_matching_bases_to_dangling_end_recovery >= 0 {
            if matching_suffix < self.min_matching_bases_to_dangling_end_recovery as usize {
                return 0;
            }
        } else if matching_suffix == 0 {
            return 0;
        }

        // Saturating read_length - matching_suffix - 1: each checked_sub
        // clamps an underflow to 0 rather than panicking.
        let alt_index_to_merge = (CigarUtils::get_read_length(&dangling_tail_merge_result.cigar)
            as usize)
            .checked_sub(matching_suffix)
            .unwrap_or(0)
            .checked_sub(1)
            .unwrap_or(0);

        let first_element_is_deletion = CigarUtils::cigar_elements_are_same_type(
            &dangling_tail_merge_result.cigar.0[0],
            &Some(Cigar::Del(0)),
        );

        // A leading deletion spanning up to the suffix start needs one extra
        // reference base skipped at the merge point.
        let must_handle_leading_deletion_case = first_element_is_deletion
            && ((&dangling_tail_merge_result.cigar.0[0].len() + matching_suffix as u32)
                == last_ref_index + 1);
        let ref_index_to_merge = last_ref_index as i32 - matching_suffix as i32
            + 1
            + if must_handle_leading_deletion_case {
                1
            } else {
                0
            };

        // Merging at (or before) the reference start would be meaningless.
        if ref_index_to_merge <= 0 {
            return 0;
        }

        // Connect the dangling branch into the reference path.
        self.base_graph.graph.add_edge(
            dangling_tail_merge_result.dangling_path[alt_index_to_merge],
            dangling_tail_merge_result.reference_path[ref_index_to_merge as usize],
            MultiSampleEdge::new(false, 1, self.num_pruning_samples),
        );

        return 1;
    }
1037
    /// Legacy dangling-head merge (used when the min-matching-bases threshold
    /// is negative); returns 1 on success, 0 when rejected.
    ///
    /// # Panics
    /// Panics when the cigar does not start with a Match element.
    fn merge_dangling_head_legacy(
        &mut self,
        mut dangling_head_merge_result: DanglingChainMergeHelper,
    ) -> usize {
        let first_element = &dangling_head_merge_result.cigar.0[0];

        // The merge point must sit inside a leading match block.
        match first_element {
            Cigar::Match(_) => {
            }
            _ => panic!("First cigar element must be Match(_)"),
        };

        let indexes_to_merge = self.best_prefix_match_legacy(
            dangling_head_merge_result.reference_path_string.as_bytes(),
            dangling_head_merge_result.dangling_path_string.as_bytes(),
            first_element.len() as usize,
        );

        match indexes_to_merge {
            None => return 0,
            Some(indexes_to_merge) => {
                // Merge index must fall strictly inside the reference path.
                if indexes_to_merge >= dangling_head_merge_result.reference_path.len() - 1 {
                    return 0;
                }

                // NOTE(review): this usize subtraction can underflow (panic in
                // debug builds) when indexes_to_merge + 2 < dangling_path.len();
                // the value is only *used* in the extension branch below.
                let num_nodes_to_extend =
                    indexes_to_merge - dangling_head_merge_result.dangling_path.len() + 2;

                // If the merge point lies beyond the dangling path, extend the
                // path against the reference first; bail out if that fails.
                if indexes_to_merge >= dangling_head_merge_result.dangling_path.len()
                    && !self.extend_dangling_path_against_reference(
                        &mut dangling_head_merge_result,
                        num_nodes_to_extend,
                    )
                {
                    return 0;
                }

                // `1 as usize` binds to the literal, so this is simply
                // reference_path[indexes_to_merge + 1].
                self.base_graph.graph.add_edge(
                    dangling_head_merge_result.reference_path[indexes_to_merge + 1 as usize],
                    dangling_head_merge_result.dangling_path[indexes_to_merge as usize],
                    MultiSampleEdge::new(false, 1, self.num_pruning_samples),
                );

                return 1;
            }
        }
    }
1096
    /// Legacy prefix comparison of two equal-region byte paths up to
    /// `max_index`. Returns `None` when the mismatch budget (from
    /// `get_max_mismatches_legacy`) is exceeded; otherwise returns the index
    /// of the *last mismatch* seen (or `None` when the prefixes agree
    /// entirely). NOTE(review): tracking the last mismatch (not the last
    /// match) mirrors the upstream GATK bestPrefixMatch implementation —
    /// confirm against that source if this looks surprising.
    fn best_prefix_match_legacy(
        &self,
        path1: &[u8],
        path2: &[u8],
        max_index: usize,
    ) -> Option<usize> {
        let max_mismatches = self.get_max_mismatches_legacy(max_index);

        let mut mismatches = 0;
        let mut index = 0;
        let mut last_good_index = None;

        while index < max_index {
            if path1[index] != path2[index] {
                mismatches += 1;
                if mismatches > max_mismatches {
                    // Budget blown: no acceptable merge point exists.
                    return None;
                }
                last_good_index = Some(index);
            }
            index += 1;
        }

        return last_good_index;
    }
1132
1133 fn get_max_mismatches_legacy(&self, length_of_dangling_branch: usize) -> usize {
1143 return if self.max_mismatches_in_dangling_head > 0 {
1144 self.max_mismatches_in_dangling_head as usize
1145 } else {
1146 max(
1147 1,
1148 length_of_dangling_branch / self.base_graph.get_kmer_size(),
1149 )
1150 };
1151 }
1152
    /// Threshold-based dangling-head merge (used when
    /// `min_matching_bases_to_dangling_end_recovery >= 0`); returns 1 on
    /// success, 0 when rejected.
    ///
    /// # Panics
    /// Panics when the cigar does not start with a Match element.
    fn merge_dangling_head(
        &mut self,
        mut dangling_head_merge_result: DanglingChainMergeHelper,
    ) -> usize {
        let first_element = &dangling_head_merge_result.cigar.0[0];

        // The merge point must sit inside a leading match block.
        match first_element {
            Cigar::Match(_) => {
            }
            _ => panic!("First cigar element must be Match(_)"),
        };

        // (ref index, read index) of the best merge point, or (-1, -1).
        let indexes_to_merge = self.best_prefix_match(
            &dangling_head_merge_result.cigar.0,
            dangling_head_merge_result.reference_path_string.as_bytes(),
            dangling_head_merge_result.dangling_path_string.as_bytes(),
        );

        if indexes_to_merge.0 <= 0 || indexes_to_merge.1 <= 0 {
            return 0;
        };

        // Merge index must fall strictly inside the reference path.
        if indexes_to_merge.0 as usize >= dangling_head_merge_result.reference_path.len() - 1 {
            return 0;
        };

        // Nodes the dangling path is short by; only used when extending below.
        let num_nodes_to_extend =
            indexes_to_merge.1 - dangling_head_merge_result.dangling_path.len() as i64 + 2;
        if indexes_to_merge.1 >= dangling_head_merge_result.dangling_path.len() as i64
            && !self.extend_dangling_path_against_reference(
                &mut dangling_head_merge_result,
                (num_nodes_to_extend) as usize,
            )
        {
            return 0;
        }

        // Connect the reference into the (possibly extended) dangling path.
        self.base_graph.graph.add_edge(
            dangling_head_merge_result.reference_path[(indexes_to_merge.0 + 1) as usize],
            dangling_head_merge_result.dangling_path[(indexes_to_merge.1) as usize],
            MultiSampleEdge::new(false, 1, self.num_pruning_samples),
        );

        return 1;
    }
1208
    /// Extends a dangling head path by `num_nodes_to_extend` vertices copied
    /// from the reference so that a merge point beyond the current path end
    /// becomes reachable. Returns `false` (leaving the path source removed)
    /// when the required reference node does not exist.
    fn extend_dangling_path_against_reference(
        &mut self,
        dangling_head_merge_result: &mut DanglingChainMergeHelper,
        num_nodes_to_extend: usize,
    ) -> bool {
        let index_of_last_dangling_node = dangling_head_merge_result.dangling_path.len() - 1;
        // Net (reference consumed - read consumed) over the whole cigar;
        // aligns dangling-path indexes onto reference-path indexes.
        let offset_for_ref_end_to_dangling_end = &dangling_head_merge_result
            .cigar
            .0
            .iter()
            .map(|ce| {
                let a = if CigarUtils::cigar_consumes_reference_bases(ce) {
                    ce.len() as i64
                } else {
                    0
                };

                let b = if CigarUtils::cigar_consumes_read_bases(ce) {
                    ce.len() as i64
                } else {
                    0
                };

                a - b
            })
            .sum::<i64>();

        let index_of_ref_node_to_use = index_of_last_dangling_node as i64
            + offset_for_ref_end_to_dangling_end
            + num_nodes_to_extend as i64;

        // Required reference node must exist.
        if index_of_ref_node_to_use >= dangling_head_merge_result.reference_path.len() as i64
            || index_of_ref_node_to_use < 0
        {
            return false;
        }

        // Detach the old source vertex of the dangling path; it is replaced
        // by the extended chain built below.
        let dangling_source = dangling_head_merge_result
            .dangling_path
            .remove(index_of_last_dangling_node);
        let ref_source_sequence = self
            .base_graph
            .graph
            .node_weight(
                dangling_head_merge_result.reference_path[index_of_ref_node_to_use as usize],
            )
            .unwrap()
            .get_sequence();

        // Prepend `num_nodes_to_extend` reference bases to the old source's
        // sequence; windows of this buffer become the new vertices.
        let mut sequence_to_extend = Vec::new();
        for i in 0..num_nodes_to_extend {
            sequence_to_extend.push(ref_source_sequence[i])
        }
        sequence_to_extend.extend_from_slice(
            self.base_graph
                .graph
                .node_weight(dangling_source)
                .unwrap()
                .get_sequence(),
        );
        // Splice out the old source's outgoing edge, remembering its weight
        // so the new chain inherits the same multiplicity.
        let source_edge = self.get_heaviest_edge(dangling_source, Direction::Outgoing);
        let mut prev_v = self.base_graph.get_edge_target(source_edge);
        let removed_edge = self.base_graph.graph.remove_edge(source_edge).unwrap();

        // Build the new chain back-to-front, pushing each new vertex onto the
        // dangling path as we go.
        for i in (1..=num_nodes_to_extend).rev() {
            let new_v = MultiDeBruijnVertex::new_with_sequence(
                sequence_to_extend[i..i + self.base_graph.get_kmer_size()].to_vec(),
            );
            let new_v_index = self.base_graph.add_node(&new_v);
            let new_e =
                MultiSampleEdge::new(false, removed_edge.multiplicity, self.num_pruning_samples);
            let _new_e_index = self.base_graph.graph.add_edge(new_v_index, prev_v, new_e);
            dangling_head_merge_result.dangling_path.push(new_v_index);
            prev_v = new_v_index;
        }

        return true;
    }
1292
    /// Walks the cigar from its end towards its start, counting contiguous
    /// matching bases between the reference (`path1`) and the dangling path
    /// (`path2`). Returns the (ref, read) indexes just before the matched
    /// region, or (-1, -1) when fewer than the configured minimum bases match.
    fn best_prefix_match(
        &self,
        cigar_elements: &Vec<Cigar>,
        path1: &[u8],
        path2: &[u8],
    ) -> (i64, i64) {
        let min_matching_bases = self.get_min_matching_bases() as i64;

        // Start both cursors at the last consumed base of each sequence.
        let mut ref_idx = cigar_elements
            .iter()
            .map(|cigar| {
                if CigarUtils::cigar_consumes_reference_bases(cigar) {
                    cigar.len() as i64
                } else {
                    0
                }
            })
            .sum::<i64>()
            - 1;
        let mut read_idx = path2.len() as i64 - 1;

        // Walk backwards over cigar elements; stop at the first element that
        // is not a both-consuming (match-type) element, at the first base
        // mismatch, or when either cursor runs off the front.
        'cigarLoop: for ce in cigar_elements.iter().rev() {
            if !(CigarUtils::cigar_consumes_reference_bases(ce)
                && CigarUtils::cigar_consumes_read_bases(ce))
            {
                break;
            } else {
                for _j in 0..(ce.len()) {
                    if path1[ref_idx as usize] != path2[read_idx as usize] {
                        break 'cigarLoop;
                    };
                    ref_idx -= 1;
                    read_idx -= 1;
                    if ref_idx < 0 || read_idx < 0 {
                        break 'cigarLoop;
                    }
                }
            }
        }

        // Number of bases matched from the read's end.
        let matches = path2.len() as i64 - 1 - read_idx;
        return if matches < min_matching_bases {
            (-1, -1)
        } else {
            (ref_idx, read_idx)
        };
    }
1352
1353 fn get_min_matching_bases(&self) -> i32 {
1357 self.min_matching_bases_to_dangling_end_recovery
1358 }
1359
    /// Builds the Smith-Waterman alignment of a dangling tail's branch
    /// against the downstream reference path, packaged as a
    /// [`DanglingChainMergeHelper`]; returns `None` when no suitable branch
    /// exists or the branch is too short to be worth recovering.
    fn generate_cigar_against_downwards_reference_path(
        &self,
        vertex: NodeIndex,
        prune_factor: usize,
        min_dangling_branch_length: i32,
        recover_all: bool,
        dangling_tail_sw_parameters: &Parameters,
    ) -> Option<DanglingChainMergeHelper> {
        // Branches shorter than this (in vertices) are not recovered.
        let min_tail_path_length = max(1, min_dangling_branch_length);
        let alt_path =
            self.find_path_upwards_to_lowest_common_ancestor(vertex, prune_factor, !recover_all);
        match alt_path {
            None => return None,
            Some(ref alt_path) => {
                // Paths that reach the reference source, or are too short,
                // cannot be merged as tails.
                if self.base_graph.is_ref_source(alt_path[0])
                    || (alt_path.len() as i32) < min_tail_path_length + 1
                {
                    return None;
                }
            }
        };

        let alt_path = alt_path.unwrap();
        // Walk the reference downwards from the common ancestor, avoiding the
        // edge that leads back into the dangling branch.
        let ref_path = self.get_reference_path(
            alt_path[0],
            TraversalDirection::Downwards,
            Some(self.get_heaviest_edge(alt_path[1], Direction::Incoming)),
        );

        let ref_bases = self.get_bases_for_path(&ref_path, false);
        let alt_bases = self.get_bases_for_path(&alt_path, false);

        let alignment = SmithWatermanAligner::align(
            ref_bases.as_bytes(),
            alt_bases.as_bytes(),
            dangling_tail_sw_parameters,
            OverhangStrategy::LeadingInDel,
            self.avx_mode,
        );

        return Some(DanglingChainMergeHelper::new(
            alt_path,
            ref_path,
            alt_bases,
            ref_bases,
            AlignmentUtils::remove_trailing_deletions(alignment.cigar),
        ));
    }
1424
    /// Builds the Smith-Waterman alignment of a dangling head's branch
    /// against the upstream reference path, packaged as a
    /// [`DanglingChainMergeHelper`]; returns `None` when no suitable branch
    /// exists or the branch is too short to be worth recovering.
    fn generate_cigar_against_upwards_reference_path(
        &self,
        vertex: NodeIndex,
        prune_factor: usize,
        min_dangling_branch_length: i32,
        recover_all: bool,
        dangling_head_sw_parameters: &Parameters,
    ) -> Option<DanglingChainMergeHelper> {
        let alt_path = self.find_path_downwards_to_highest_common_desendant_of_reference(
            vertex,
            prune_factor,
            !recover_all,
        );

        match alt_path {
            None => return None,
            Some(ref alt_path) => {
                // Paths that reach the reference sink, or are too short,
                // cannot be merged as heads.
                if self.base_graph.is_ref_sink(alt_path[0])
                    || (alt_path.len() as i32) < min_dangling_branch_length + 1
                {
                    return None;
                }
            }
        }
        let alt_path = alt_path.unwrap();

        // Walk the reference upwards from the common descendant.
        let ref_path = self.get_reference_path(alt_path[0], TraversalDirection::Upwards, None);

        let ref_bases = self.get_bases_for_path(&ref_path, true);
        let alt_bases = self.get_bases_for_path(&alt_path, true);

        let alignment = SmithWatermanAligner::align(
            ref_bases.as_bytes(),
            alt_bases.as_bytes(),
            dangling_head_sw_parameters,
            OverhangStrategy::LeadingInDel,
            self.avx_mode,
        );

        return Some(DanglingChainMergeHelper::new(
            alt_path,
            ref_path,
            alt_bases,
            ref_bases,
            AlignmentUtils::remove_trailing_deletions(alignment.cigar),
        ));
    }
1487
    /// Follows outgoing edges from `vertex` until a reference node is reached,
    /// delegating the traversal to `find_path` with closures that define the
    /// stop/accept/step policy. In `give_up_at_branch` mode the walk aborts at
    /// any vertex with out-degree != 1; otherwise it follows the heaviest
    /// outgoing edge through branches. (Note: "desendant" is a typo preserved
    /// from the trait definition.)
    fn find_path_downwards_to_highest_common_desendant_of_reference(
        &self,
        vertex: NodeIndex,
        prune_factor: usize,
        give_up_at_branch: bool,
    ) -> Option<Vec<NodeIndex>> {
        return if give_up_at_branch {
            // Stop at a reference node or at any branch point.
            let done = |v: NodeIndex| -> bool {
                self.base_graph.is_reference_node(v) || self.base_graph.out_degree_of(v) != 1
            };

            // The walk succeeds only when it terminated on a reference node.
            let return_path = |v: NodeIndex| -> bool { self.base_graph.is_reference_node(v) };

            // Out-degree is exactly 1 here, so the single edge must exist.
            let next_edge =
                |v: NodeIndex| -> EdgeIndex { self.base_graph.outgoing_edge_of(v).unwrap() };

            let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_target(e) };

            self.find_path(
                vertex,
                prune_factor,
                &done,
                &return_path,
                &next_edge,
                &next_node,
            )
        } else {
            // Stop at a reference node or a dead end; branches are traversed
            // via the heaviest outgoing edge.
            let done = |v: NodeIndex| -> bool {
                self.base_graph.is_reference_node(v) || self.base_graph.out_degree_of(v) == 0
            };

            let return_path = |v: NodeIndex| -> bool { self.base_graph.is_reference_node(v) };

            let next_edge =
                |v: NodeIndex| -> EdgeIndex { self.get_heaviest_edge(v, Direction::Outgoing) };

            let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_target(e) };

            self.find_path(
                vertex,
                prune_factor,
                &done,
                &return_path,
                &next_edge,
                &next_node,
            )
        };
    }
1547
1548 fn get_reference_path(
1557 &self,
1558 start: NodeIndex,
1559 direction: TraversalDirection,
1560 blacklisted_edge: Option<EdgeIndex>,
1561 ) -> Vec<NodeIndex> {
1562 let mut path = Vec::new();
1563
1564 let mut v = Some(start);
1565 while v.is_some() {
1566 path.push(v.unwrap());
1567 v = match direction {
1568 TraversalDirection::Downwards => {
1569 self.base_graph
1570 .get_next_reference_vertex(v, true, blacklisted_edge)
1571 }
1572 TraversalDirection::Upwards => self.base_graph.get_prev_reference_vertex(v),
1573 };
1574 }
1575
1576 return path;
1577 }
1578
1579 fn find_path_upwards_to_lowest_common_ancestor(
1590 &self,
1591 vertex: NodeIndex,
1592 prune_factor: usize,
1593 give_up_at_branch: bool,
1594 ) -> Option<Vec<NodeIndex>> {
1595 return if give_up_at_branch {
1596 let done = |v: NodeIndex| -> bool {
1597 self.base_graph.in_degree_of(v) != 1 || self.base_graph.out_degree_of(v) >= 2
1598 };
1599
1600 let return_path = |v: NodeIndex| -> bool { self.base_graph.out_degree_of(v) > 1 };
1601
1602 let next_edge =
1603 |v: NodeIndex| -> EdgeIndex { self.base_graph.incoming_edge_of(v).unwrap() };
1604
1605 let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_source(e) };
1606
1607 self.find_path(
1608 vertex,
1609 prune_factor,
1610 &done,
1611 &return_path,
1612 &next_edge,
1613 &next_node,
1614 )
1615 } else {
1616 let done = |v: NodeIndex| -> bool {
1617 self.has_incident_ref_edge(v) || self.base_graph.in_degree_of(v) == 0
1618 };
1619
1620 let return_path = |v: NodeIndex| -> bool {
1621 self.base_graph.out_degree_of(v) > 1 && self.has_incident_ref_edge(v)
1622 };
1623
1624 let next_edge =
1625 |v: NodeIndex| -> EdgeIndex { self.get_heaviest_edge(v, Direction::Incoming) };
1626
1627 let next_node = |e: EdgeIndex| -> NodeIndex { self.base_graph.get_edge_source(e) };
1628
1629 self.find_path(
1630 vertex,
1631 prune_factor,
1632 &done,
1633 &return_path,
1634 &next_edge,
1635 &next_node,
1636 )
1637 };
1638 }
1639
1640 fn find_path(
1652 &self,
1653 vertex: NodeIndex,
1654 prune_factor: usize,
1655 done: &dyn Fn(NodeIndex) -> bool,
1656 return_path: &dyn Fn(NodeIndex) -> bool,
1657 next_edge: &dyn Fn(NodeIndex) -> EdgeIndex,
1658 next_node: &dyn Fn(EdgeIndex) -> NodeIndex,
1659 ) -> Option<Vec<NodeIndex>> {
1660 let mut path = Vec::new();
1661 let mut visited_nodes = HashSet::new();
1662
1663 let mut v = vertex;
1664 while !done(v) {
1665 let edge = next_edge(v);
1666 if self
1668 .base_graph
1669 .graph
1670 .edge_weight(edge)
1671 .unwrap()
1672 .get_pruning_multiplicity()
1673 < prune_factor
1674 {
1675 visited_nodes.par_extend(path);
1677 path = Vec::new();
1678 } else {
1679 path.insert(0, v);
1680 }
1681
1682 v = next_node(edge);
1683 if path.contains(&v) || visited_nodes.contains(&v) {
1685 error!("Dangling end recovery killed because of a loop (find_path)");
1686 return None;
1687 }
1688 }
1689 path.insert(0, v);
1690
1691 return if return_path(v) { Some(path) } else { None };
1692 }
1693
1694 fn has_incident_ref_edge(&self, v: NodeIndex) -> bool {
1695 for edge in self.base_graph.graph.edges_directed(v, Direction::Incoming) {
1696 if self.base_graph.edge_is_ref(edge.id()) {
1697 return true;
1698 }
1699 }
1700 return false;
1701 }
1702
1703 fn get_heaviest_edge(&self, v: NodeIndex, direction: Direction) -> EdgeIndex {
1704 let edges = self
1705 .base_graph
1706 .graph
1707 .edges_directed(v, direction)
1708 .map(|e| e.id())
1709 .collect::<Vec<EdgeIndex>>();
1710 return if edges.len() == 1 {
1711 edges[0]
1712 } else {
1713 let comparator = Extract::new(|edge: &EdgeIndex| {
1714 self.base_graph
1715 .graph
1716 .edge_weight(*edge)
1717 .unwrap()
1718 .multiplicity
1719 });
1720
1721 *edges
1722 .iter()
1723 .max_by(|l, r| comparator.compare(l, r))
1724 .unwrap()
1725 };
1726 }
1727
1728 fn get_bases_for_path(&self, path: &Vec<NodeIndex>, expand_source: bool) -> String {
1736 assert!(!path.is_empty(), "Path cannot be empty");
1737
1738 let sb = path
1739 .par_iter()
1740 .map(|v| {
1741 if expand_source && self.base_graph.is_source(*v) {
1742 let mut seq = self
1743 .base_graph
1744 .graph
1745 .node_weight(*v)
1746 .unwrap()
1747 .get_sequence()
1748 .to_vec();
1749 seq.reverse();
1750 std::str::from_utf8(&seq).unwrap().to_string()
1751 } else {
1752 std::str::from_utf8(
1753 self.base_graph
1754 .graph
1755 .node_weight(*v)
1756 .unwrap()
1757 .get_suffix_as_array(),
1758 )
1759 .unwrap()
1760 .to_string()
1761 }
1762 })
1763 .collect::<String>();
1764
1765 return sb;
1766 }
1767
    /// Delegates to the underlying base graph: remove all paths that are not
    /// connected to the reference path.
    fn remove_paths_not_connected_to_ref(&mut self) {
        self.base_graph.remove_paths_not_connected_to_ref()
    }
1771
    /// Convert this kmer (de Bruijn) graph into a sequence graph, delegating
    /// to the underlying base graph.
    fn to_sequence_graph(&self) -> SeqGraph<BaseEdgeStruct> {
        self.base_graph.to_sequence_graph()
    }
1775
    /// Kmer size of the underlying base graph.
    fn get_kmer_size(&self) -> usize {
        self.base_graph.get_kmer_size()
    }
1779
    /// Reference source vertex of the underlying base graph, if one exists.
    fn get_reference_source_vertex(&self) -> Option<NodeIndex> {
        self.base_graph.get_reference_source_vertex()
    }
1783
    /// Reference sink vertex of the underlying base graph, if one exists.
    fn get_reference_sink_vertex(&self) -> Option<NodeIndex> {
        self.base_graph.get_reference_sink_vertex()
    }
1787
    /// Post-assembly hook run before haplotype finding. Intentionally a
    /// no-op for this graph type; the underscored parameters exist only to
    /// match the expected signature (presumably required by a trait —
    /// confirm against `AbstractReadThreadingGraph`).
    fn post_process_for_haplotype_finding<L: Locatable>(
        &mut self,
        _debug_graph_output_path: Option<&String>,
        _ref_haplotype: &L,
    ) {
    }
1797}