lorikeet_genome/read_threading/read_threading_assembler.rs
1use gkl::smithwaterman::{OverhangStrategy, Parameters};
2use hashlink::LinkedHashMap;
3use rayon::prelude::*;
4use rust_htslib::bam::record::{Cigar, CigarString};
5
6// use crate::read_error_corrector::nearby_kmer_error_corrector::NearbyKmerErrorCorrector;
7use crate::assembly::assembly_region::AssemblyRegion;
8use crate::assembly::assembly_result::{AssemblyResult, Status};
9use crate::assembly::assembly_result_set::AssemblyResultSet;
10use crate::graphs::adaptive_chain_pruner::AdaptiveChainPruner;
11use crate::graphs::base_edge::{BaseEdge, BaseEdgeStruct};
12use crate::graphs::base_graph::BaseGraph;
13use crate::graphs::base_vertex::BaseVertex;
14use crate::graphs::chain_pruner::ChainPruner;
15use crate::graphs::graph_based_k_best_haplotype_finder::GraphBasedKBestHaplotypeFinder;
16use crate::graphs::k_best_haplotype::KBestHaplotype;
17use crate::graphs::seq_graph::SeqGraph;
18use crate::graphs::seq_vertex::SeqVertex;
19use crate::haplotype::haplotype::Haplotype;
20use crate::model::byte_array_allele::Allele;
21use crate::pair_hmm::pair_hmm_likelihood_calculation_engine::AVXMode;
22use crate::graphs::low_weight_chain_pruner::LowWeightChainPruner;
23use crate::read_threading::abstract_read_threading_graph::{AbstractReadThreadingGraph, SequenceForKmers};
24use crate::read_threading::read_threading_graph::ReadThreadingGraph;
25use crate::reads::bird_tool_reads::BirdToolRead;
26use crate::reads::cigar_utils::CigarUtils;
27use crate::reads::read_clipper::ReadClipper;
28use crate::utils::simple_interval::{Locatable, SimpleInterval};
29
30const PRUNE_FACTOR_COVERAGE_THRESHOLD: f64 = 10.0;
31
32#[derive(Debug, Clone)]
33pub struct ReadThreadingAssembler {
34 pub(crate) kmer_sizes: Vec<usize>,
35 dont_increase_kmer_sizes_for_cycles: bool,
36 allow_non_unique_kmers_in_ref: bool,
37 generate_seq_graph: bool,
38 // recover_haplotypes_from_edges_not_covered_in_junction_trees: bool,
39 num_pruning_samples: i32,
40 disable_prune_factor_correction: bool, // if the region has many reads, having a low prune factor can cause excessive runtimes
41 num_best_haplotypes_per_graph: i32,
42 prune_before_cycle_counting: bool,
43 remove_paths_not_connected_to_ref: bool,
44 just_return_raw_graph: bool,
45 pub(crate) recover_dangling_branches: bool,
46 pub(crate) recover_all_dangling_branches: bool,
47 pub(crate) min_dangling_branch_length: i32,
48 pub(crate) min_base_quality_to_use_in_assembly: u8,
49 prune_factor: usize,
50 min_matching_bases_to_dangling_end_recovery: i32,
51 chain_pruner: ChainPruner,
52 pub(crate) debug_graph_transformations: bool,
53 pub(crate) debug_graph_output_path: Option<String>,
54 // graph_haplotype_histogram_path: Option<String>,
55 pub(crate) graph_output_path: Option<String>,
56}
57
58impl ReadThreadingAssembler {
59 const DEFAULT_NUM_PATHS_PER_GRAPH: usize = 128;
60 const KMER_SIZE_ITERATION_INCREASE: usize = 13;
61 const MAX_KMER_ITERATIONS_TO_ATTEMPT: usize = 6;
62
63 /**
64 * If false, we will only write out a region around the reference source
65 */
66 // const PRINT_FILL_GRAPH_FOR_DEBUGGING: bool = true;
67 const DEFAULT_MIN_BASE_QUALITY_TO_USE: u8 = 10;
68 const MIN_HAPLOTYPE_REFERENCE_LENGTH: u32 = 30;
69
70 pub fn new(
71 max_allowed_paths_for_read_threading_assembler: i32,
72 mut kmer_sizes: Vec<usize>,
73 dont_increase_kmer_sizes_for_cycles: bool,
74 allow_non_unique_kmers_in_ref: bool,
75 num_pruning_samples: i32,
76 prune_factor: usize,
77 use_adaptive_pruning: bool,
78 initial_error_rate_for_pruning: f64,
79 pruning_log_odds_threshold: f64,
80 pruning_seeding_log_odds_threshold: f64,
81 max_unpruned_variants: usize,
82 _use_linked_debruijn_graphs: bool,
83 enable_legacy_graph_cycle_detection: bool,
84 min_matching_bases_to_dangle_end_recovery: i32,
85 disable_prune_factor_correction: bool,
86 ) -> ReadThreadingAssembler {
87 assert!(
88 max_allowed_paths_for_read_threading_assembler >= 1,
89 "num_best_haplotypes_per_graph should be >= 1 but got {}",
90 max_allowed_paths_for_read_threading_assembler
91 );
92 kmer_sizes.sort_unstable();
93
94 let chain_pruner = if use_adaptive_pruning {
95 ChainPruner::AdaptiveChainPruner(AdaptiveChainPruner::new(
96 initial_error_rate_for_pruning,
97 pruning_log_odds_threshold,
98 pruning_seeding_log_odds_threshold,
99 max_unpruned_variants,
100 ))
101 } else {
102 ChainPruner::LowWeightChainPruner(LowWeightChainPruner::new(prune_factor))
103 };
104
105 // TODO: //!use_linked_debruijn_graphs should be used for generate_seq_graph
106 // but have not yet implement junction tree method
107 ReadThreadingAssembler {
108 kmer_sizes,
109 dont_increase_kmer_sizes_for_cycles,
110 allow_non_unique_kmers_in_ref,
111 num_pruning_samples,
112 prune_factor,
113 chain_pruner,
114 generate_seq_graph: true,
115 prune_before_cycle_counting: !enable_legacy_graph_cycle_detection,
116 remove_paths_not_connected_to_ref: true,
117 just_return_raw_graph: false,
118 recover_dangling_branches: true,
119 recover_all_dangling_branches: false,
120 min_dangling_branch_length: 0,
121 num_best_haplotypes_per_graph: max_allowed_paths_for_read_threading_assembler,
122 min_matching_bases_to_dangling_end_recovery: min_matching_bases_to_dangle_end_recovery,
123 // recover_haplotypes_from_edges_not_covered_in_junction_trees: true,
124 min_base_quality_to_use_in_assembly: Self::DEFAULT_MIN_BASE_QUALITY_TO_USE,
125 debug_graph_transformations: false,
126 debug_graph_output_path: Some(format!("graph_debugging")),
127 // graph_haplotype_histogram_path: None,
128 graph_output_path: None,
129 disable_prune_factor_correction
130 }
131 }
132
133 pub fn default() -> Self {
134 Self::new(
135 Self::DEFAULT_NUM_PATHS_PER_GRAPH as i32,
136 vec![25],
137 true,
138 true,
139 1,
140 2,
141 false,
142 0.001,
143 2.0,
144 2.0,
145 std::usize::MAX,
146 false,
147 false,
148 3,
149 false,
150 )
151 }
152
153 pub fn default_with_kmers(
154 max_allowed_paths_for_read_threading_assembler: i32,
155 kmer_sizes: Vec<usize>,
156 prune_factor: usize,
157 ) -> Self {
158 Self::new(
159 max_allowed_paths_for_read_threading_assembler,
160 kmer_sizes,
161 true,
162 true,
163 1,
164 prune_factor,
165 false,
166 0.001,
167 2.0,
168 2.0,
169 std::usize::MAX,
170 false,
171 false,
172 3,
173 false
174 )
175 }
176
177 pub fn set_just_return_raw_graph(&mut self, value: bool) {
178 self.just_return_raw_graph = value;
179 }
180
181 pub fn set_remove_paths_not_connected_to_ref(&mut self, value: bool) {
182 self.remove_paths_not_connected_to_ref = value;
183 }
184
185 pub fn set_recover_dangling_branches(&mut self, value: bool) {
186 self.recover_dangling_branches = value;
187 }
188
189 fn set_prune_factor(&mut self, value: usize) {
190 self.prune_factor = value;
191 self.chain_pruner.set_prune_factor(value);
192 }
193 /**
194 * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
195 * @param assemblyRegion AssemblyRegion object holding the reads which are to be used during assembly
196 * @param refHaplotype reference haplotype object
197 * @param fullReferenceWithPadding byte array holding the reference sequence with padding
198 * @param refLoc GenomeLoc object corresponding to the reference sequence with padding
199 * @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used.
200 * @param aligner {@link SmithWatermanAligner} used to align dangling ends in assembly graphs to the reference sequence
201 * @return the resulting assembly-result-set
202 */
203 pub fn run_local_assembly<'b>(
204 &mut self,
205 mut assembly_region: AssemblyRegion,
206 ref_haplotype: &'b mut Haplotype<SimpleInterval>,
207 full_reference_with_padding: Vec<u8>,
208 ref_loc: SimpleInterval,
209 // read_error_corrector: Option<C>,
210 sample_names: &'b [String],
211 dangling_end_sw_parameters: Parameters,
212 reference_to_haplotype_sw_parameters: Parameters,
213 avx_mode: AVXMode,
214 additional_kmer_sizes: Option<Vec<usize>>
215 ) -> AssemblyResultSet<ReadThreadingGraph> {
216 assert!(
217 full_reference_with_padding.len() == ref_loc.size(),
218 "Reference bases and reference loc must be the same size. {} -> {}",
219 full_reference_with_padding.len(),
220 ref_loc.size()
221 );
222
223 // Note that error correction does not modify the original reads,
224 // which are used for genotyping TODO this might come before error correction /
225 // let mut corrected_reads = assembly_region.get_reads_cloned();
226 // match read_error_corrector {
227 // // TODO: Is it possible to perform this
228 // // without cloning? Perhaps get_reads() should just return ownership of reads?
229 // // Can't move reads out of assembly region as they are required later on during
230 // // read threading phase. Very annoying
231 // None => assembly_region.get_reads_cloned(),
232 // Some(mut read_error_corrector) => {
233 // read_error_corrector.correct_reads(assembly_region.get_reads_cloned())
234 // }
235 // };
236
237 // Revert clipped bases if necessary (since we do not want to assemble them)
238 let corrected_reads = assembly_region.move_reads();
239 let corrected_reads = corrected_reads
240 .into_par_iter()
241 .map(|read| ReadClipper::new(read).hard_clip_soft_clipped_bases())
242 .collect::<Vec<BirdToolRead>>();
243
244 // calculate coverage estimate. no. reads / region size
245 let old_prune_factor = self.prune_factor;
246 if !self.disable_prune_factor_correction && !self.chain_pruner.is_adaptive() {
247 let coverage = assembly_region.calculate_coverage(&corrected_reads);
248 // debug!("Coverage {} read count {} region size {}", coverage, corrected_reads.len(), assembly_region.get_span().size());
249 let new_prune_factor = if coverage > PRUNE_FACTOR_COVERAGE_THRESHOLD {
250 2
251 } else {
252 0
253 };
254 self.set_prune_factor(new_prune_factor);
255 }
256
257
258 // debug!("Corrected reads {}", corrected_reads.len());
259 // let non_ref_rt_graphs: Vec<ReadThreadingGraph> = Vec::new();
260 // let non_ref_seq_graphs: Vec<SeqGraph<BaseEdgeStruct>> = Vec::new();
261 let active_region_extended_location = assembly_region.get_padded_span();
262 ref_haplotype.set_genome_location(active_region_extended_location.clone());
263
264 let mut result_set = AssemblyResultSet::new(
265 assembly_region,
266 full_reference_with_padding,
267 ref_loc.clone(),
268 ref_haplotype.clone(),
269 );
270
271 // either follow the old method for building graphs and then assembling or assemble and haplotype call before expanding kmers
272 if self.generate_seq_graph {
273 self.assemble_kmer_graphs_and_haplotype_call(
274 &ref_haplotype,
275 &ref_loc,
276 &corrected_reads,
277 &mut result_set,
278 &active_region_extended_location,
279 sample_names,
280 &dangling_end_sw_parameters,
281 &reference_to_haplotype_sw_parameters,
282 avx_mode,
283 additional_kmer_sizes
284 );
285 } else {
286 self.assemble_graphs_and_expand_kmers_given_haplotypes(
287 &ref_haplotype,
288 &ref_loc,
289 &corrected_reads,
290 &mut result_set,
291 &active_region_extended_location,
292 sample_names,
293 &dangling_end_sw_parameters,
294 &reference_to_haplotype_sw_parameters,
295 avx_mode,
296 additional_kmer_sizes
297 )
298 }
299
300 // reset prune_factor
301 self.set_prune_factor(old_prune_factor);
302
303 // If we get to this point then no graph worked... thats bad and indicates something
304 // horrible happened, in this case we just return a reference haplotype
305 result_set.region_for_genotyping.reads = corrected_reads;
306 // debug!(
307 // "Found {} to compare every read against",
308 // result_set.haplotypes.len()
309 // );
310 result_set
311 }
312
313 /**
314 * Follow the old behavior, call into {@link #assemble(List, Haplotype, SAMFileHeader, SmithWatermanAligner)} to decide if a graph
315 * is acceptable for haplotype discovery then detect haplotypes.
316 */
317 fn assemble_kmer_graphs_and_haplotype_call<'b>(
318 &mut self,
319 ref_haplotype: &'b Haplotype<SimpleInterval>,
320 ref_loc: &'b SimpleInterval,
321 corrected_reads: &'b Vec<BirdToolRead>,
322 // non_ref_seq_graphs: &mut Vec<SeqGraph<BaseEdgeStruct>>,
323 result_set: &mut AssemblyResultSet<ReadThreadingGraph>,
324 active_region_extended_location: &'b SimpleInterval,
325 sample_names: &'b [String],
326 dangling_end_sw_parameters: &Parameters,
327 reference_to_haplotype_sw_parameters: &Parameters,
328 avx_mode: AVXMode,
329 additional_kmer_sizes: Option<Vec<usize>>
330 ) {
331 // create the graphs by calling our subclass assemble method
332 self.assemble(
333 &corrected_reads,
334 ref_haplotype,
335 sample_names,
336 dangling_end_sw_parameters,
337 avx_mode,
338 additional_kmer_sizes
339 )
340 .into_iter()
341 .for_each(|mut result| {
342 // debug!("graph after assembly {:?}", &result.graph.as_ref().unwrap().base_graph);
343 // debug!(
344 // "Result loc {:?} Status {:?} haps {:?}",
345 // active_region_extended_location, &result.status, &result.discovered_haplotypes
346 // );
347
348 if result.status == Status::AssembledSomeVariation {
349 // do some QC on the graph
350 Self::sanity_check_graph(&result.graph.as_ref().unwrap().base_graph, ref_haplotype);
351 // add it to graphs with meaningful non-reference features
352 self.find_best_path(
353 &mut result,
354 ref_haplotype,
355 ref_loc,
356 active_region_extended_location,
357 reference_to_haplotype_sw_parameters,
358 result_set,
359 avx_mode,
360 );
361 // non_ref_seq_graphs.push(result.graph.unwrap());
362 // result_set.add_haplotype(result);
363 }
364 });
365 }
366
367 /**
368 * Given reads and a reference haplotype give us graphs to use for constructing
369 * non-reference haplotypes.
370 *
371 * @param reads the reads we're going to assemble
372 * @param refHaplotype the reference haplotype
373 * @param aligner {@link SmithWatermanAligner} used to align dangling ends in assembly graphs to the reference sequence
374 * @return a non-null list of reads
375 */
376 pub fn assemble<'b>(
377 &mut self,
378 reads: &'b Vec<BirdToolRead>,
379 ref_haplotype: &'b Haplotype<SimpleInterval>,
380 sample_names: &'b [String],
381 dangling_end_sw_parameters: &Parameters,
382 avx_mode: AVXMode,
383 additional_kmer_sizes: Option<Vec<usize>>,
384 ) -> Vec<AssemblyResult<SimpleInterval, ReadThreadingGraph>> {
385
386 let mut kmer_sizes = self.kmer_sizes.clone();
387 if let Some(additional) = additional_kmer_sizes {
388 kmer_sizes.extend(additional);
389 }
390
391 // try using the requested kmer sizes
392 let mut results = kmer_sizes
393 .par_iter()
394 .filter_map(|kmer_size| {
395 self.create_graph(
396 reads,
397 ref_haplotype,
398 *kmer_size,
399 self.dont_increase_kmer_sizes_for_cycles,
400 self.allow_non_unique_kmers_in_ref,
401 sample_names,
402 dangling_end_sw_parameters,
403 avx_mode,
404 )
405 // {
406 // None => continue,
407 // Some(assembly_result) => {
408 // debug!(
409 // "Found assembly result (No increase) graph -> {:?}",
410 // assembly_result.graph.as_ref().unwrap().base_graph
411 // );
412 // results.push(assembly_result)
413 // }
414 // }
415 })
416 .collect::<Vec<AssemblyResult<SimpleInterval, ReadThreadingGraph>>>();
417
418
419 if results.is_empty() && !self.dont_increase_kmer_sizes_for_cycles {
420 let mut kmer_size =
421 *self.kmer_sizes.iter().max().unwrap() + Self::KMER_SIZE_ITERATION_INCREASE;
422 // if kmer_size is even, add 1 to make it odd
423 if kmer_size % 2 == 0 {
424 kmer_size += 1;
425 }
426 let mut num_iterations = 1;
427 while results.is_empty() && num_iterations <= Self::MAX_KMER_ITERATIONS_TO_ATTEMPT {
428 // on the last attempt we will allow low complexity graphs
429 let last_attempt = num_iterations == Self::MAX_KMER_ITERATIONS_TO_ATTEMPT;
430 match self.create_graph(
431 reads,
432 &ref_haplotype,
433 kmer_size,
434 last_attempt,
435 last_attempt,
436 sample_names,
437 dangling_end_sw_parameters,
438 avx_mode,
439 ) {
440 None => {
441 // pass
442 }
443 Some(assembly_result) => {
444 results.push(assembly_result);
445 }
446 };
447 kmer_size += Self::KMER_SIZE_ITERATION_INCREASE;
448 num_iterations += 1;
449 }
450 }
451
452 return results;
453 }
454
455 /**
456 * Follow the kmer expansion heurisics as {@link #assemble(List, Haplotype, SAMFileHeader, SmithWatermanAligner)}, but in this case
457 * attempt to recover haplotypes from the kmer graph and use them to assess whether to expand the kmer size.
458 */
459 fn assemble_graphs_and_expand_kmers_given_haplotypes<'b>(
460 &mut self,
461 ref_haplotype: &'b Haplotype<SimpleInterval>,
462 ref_loc: &'b SimpleInterval,
463 corrected_reads: &'b Vec<BirdToolRead>,
464 result_set: &mut AssemblyResultSet<ReadThreadingGraph>,
465 active_region_extended_location: &'b SimpleInterval,
466 sample_names: &'b [String],
467 dangling_end_sw_parameters: &Parameters,
468 reference_to_haplotype_sw_parameters: &Parameters,
469 avx_mode: AVXMode,
470 additional_kmer_sizes: Option<Vec<usize>>
471 ) {
472 let mut saved_assembly_results = Vec::new();
473
474 let mut has_adequately_assembled_graph = false;
475 let kmers_to_try = self.get_expanded_kmer_list(additional_kmer_sizes);
476 // first, try using the requested kmer sizes
477 for i in 0..kmers_to_try.len() {
478 let kmer_size = kmers_to_try[i];
479 let is_last_cycle = i == kmers_to_try.len() - 1;
480 if !has_adequately_assembled_graph {
481 let assembled_result = self.create_graph(
482 corrected_reads,
483 &ref_haplotype,
484 kmer_size,
485 is_last_cycle || self.dont_increase_kmer_sizes_for_cycles,
486 is_last_cycle || self.allow_non_unique_kmers_in_ref,
487 &sample_names,
488 dangling_end_sw_parameters,
489 avx_mode,
490 );
491 match assembled_result {
492 None => {} //pass
493 Some(mut assembled_result) => {
494 if assembled_result.status == Status::AssembledSomeVariation {
495 // do some QC on the graph
496 Self::sanity_check_graph(
497 assembled_result
498 .threading_graph
499 .as_ref()
500 .unwrap()
501 .get_base_graph(),
502 &ref_haplotype,
503 );
504 let _ = &mut assembled_result
505 .threading_graph
506 .as_mut()
507 .unwrap()
508 .post_process_for_haplotype_finding(
509 self.debug_graph_output_path.as_ref(),
510 ref_haplotype.genome_location.as_ref().unwrap(),
511 );
512 // add it to graphs with meaningful non-reference features
513 // non_ref_rt_graphs.push(assembled_result.threading_graph.unwrap().clone());
514 // if graph
515 // TODO: Add histogram plotting
516 // let graph = assembled_result.threading_graph.as_ref().unwrap();
517 self.find_best_path(
518 &mut assembled_result,
519 ref_haplotype,
520 ref_loc,
521 active_region_extended_location,
522 reference_to_haplotype_sw_parameters,
523 result_set,
524 avx_mode,
525 );
526
527 saved_assembly_results.push(assembled_result);
528 //TODO LOGIC PLAN HERE - we want to check if we have a trustworthy graph (i.e. no badly assembled haplotypes) if we do, emit it.
529 //TODO - but if we failed to assemble due to excessive looping or did have badly assembled haplotypes then we expand kmer size.
530 //TODO - If we get no variation
531
532 // if asssembly didn't fail ( which is a degenerate case that occurs for some subset of graphs with difficult loop
533 if !saved_assembly_results
534 .last()
535 .unwrap()
536 .discovered_haplotypes
537 .is_empty()
538 {
539 // we have found our workable kmer size so lets add the results and finish
540 let assembled_result = saved_assembly_results.last().unwrap();
541 if !assembled_result.contains_suspect_haploptypes {
542 // let mut result_set = result_set.lock().unwrap();
543 for h in assembled_result.discovered_haplotypes.clone() {
544 result_set.add_haplotype(h);
545 }
546
547 has_adequately_assembled_graph = true;
548 }
549 }
550 } else if assembled_result.status == Status::JustAssembledReference {
551 has_adequately_assembled_graph = true;
552 }
553 }
554 }
555 }
556 }
557
558 // This indicates that we have thrown everything away... we should go back and
559 // check that we weren't too conservative about assembly results that might
560 // otherwise be good
561 if !has_adequately_assembled_graph {
562 // search for the last haplotype set that had any results, if none are found just return me
563 // In this case we prefer the last meaningful kmer size if possible
564 // for result in saved_assembly_results.
565 saved_assembly_results.reverse();
566 for result in saved_assembly_results {
567 if result.discovered_haplotypes.len() > 1 {
568 // let mut result_set = result_set.lock().unwrap();
569 let ar_index = result_set.add_assembly_result(result);
570 for h in result_set.assembly_results[ar_index]
571 .discovered_haplotypes
572 .clone()
573 {
574 result_set.add_haplotype_and_assembly_result(h, ar_index);
575 }
576 break;
577 }
578 }
579 }
580 }
581
582 /**
583 * Make sure the reference sequence is properly represented in the provided graph
584 *
585 * @param graph the graph to check
586 * @param refHaplotype the reference haplotype
587 */
588 fn sanity_check_graph<'b, V: BaseVertex, E: BaseEdge>(
589 graph: &'b BaseGraph<V, E>,
590 ref_haplotype: &'b Haplotype<SimpleInterval>,
591 ) {
592 let ref_source_vertex = graph.get_reference_source_vertex();
593 let ref_sink_vertex = graph.get_reference_sink_vertex();
594 if ref_source_vertex.is_none() {
595 panic!("All reference graphs must have a reference source vertex");
596 };
597
598 if ref_sink_vertex.is_none() {
599 panic!("All reference graphs must have a reference sink vertex");
600 };
601
602 if graph.get_reference_bytes(ref_source_vertex.unwrap(), ref_sink_vertex, true, true)
603 != ref_haplotype.get_bases()
604 {
605 panic!(
606 "Mismatch between the reference haplotype and the reference assembly graph path for \n\
607 +++++++++ \n\
608 graph = {} \n\
609 haplotype = {} \n\
610 loc = {:?} \n\
611 +++++++++ \n",
612 std::str::from_utf8(
613 graph
614 .get_reference_bytes(
615 ref_source_vertex.unwrap(),
616 ref_sink_vertex,
617 true,
618 true
619 )
620 .as_slice()
621 )
622 .unwrap(),
623 std::str::from_utf8(ref_haplotype.get_bases()).unwrap(),
624 &ref_haplotype.genome_location
625 );
626 };
627 }
628
629 /**
630 * Method for getting a list of all of the specified kmer sizes to test for the graph including kmer expansions
631 * @return
632 */
633 fn get_expanded_kmer_list(&self, additional_kmer_sizes: Option<Vec<usize>>) -> Vec<usize> {
634 let mut return_list = Vec::new();
635 return_list.extend(self.kmer_sizes.iter());
636 if !self.dont_increase_kmer_sizes_for_cycles {
637 let mut kmer_size =
638 self.kmer_sizes.iter().max().unwrap() + Self::KMER_SIZE_ITERATION_INCREASE;
639 let mut num_iterations = 1;
640 while num_iterations <= Self::MAX_KMER_ITERATIONS_TO_ATTEMPT {
641 return_list.push(kmer_size);
642 kmer_size += Self::KMER_SIZE_ITERATION_INCREASE;
643 num_iterations += 1;
644 }
645 }
646
647 if let Some(additional_kmer_sizes) = additional_kmer_sizes {
648 return_list.extend(additional_kmer_sizes.iter());
649 }
650
651 return return_list;
652 }
653
654 /**
655 * Print graph to file NOTE this requires that debugGraphTransformations be enabled.
656 *
657 * @param graph the graph to print
658 * @param fileName the name to give the graph file
659 */
660 fn print_debug_graph_transform_abstract<A: AbstractReadThreadingGraph>(
661 &self,
662 graph: &A,
663 file_name: String,
664 ) {
665 // if Self::PRINT_FILL_GRAPH_FOR_DEBUGGING {
666 // graph.print_graph(file_name, self.prune_factor as usize)
667 // } else {
668 // grap
669 // }
670 graph.print_graph(file_name, self.prune_factor as usize)
671 }
672
673 /**
674 * Print graph to file NOTE this requires that debugGraphTransformations be enabled.
675 *
676 * @param graph the graph to print
677 * @param fileName the name to give the graph file
678 */
679 fn print_debug_graph_transform_seq_graph<E: BaseEdge>(
680 &self,
681 graph: &SeqGraph<E>,
682 file_name: String,
683 ) {
684 // if Self::PRINT_FILL_GRAPH_FOR_DEBUGGING {
685 // graph.print_graph(file_name, self.prune_factor as usize)
686 // } else {
687 // grap
688 // }
689 graph
690 .base_graph
691 .print_graph(&file_name, true, self.prune_factor as usize)
692 }
693
694 /**
695 * Find discover paths by using KBestHaplotypeFinder over each graph.
696 *
697 * This method has the side effect that it will annotate all of the AssemblyResults objects with the derived haplotypes
698 * which can be used for basing kmer graph pruning on the discovered haplotypes.
699 *
700 * @param graph graph to be used for kmer detection
701 * @param assemblyResults assembly results objects for this graph
702 * @param refHaplotype reference haplotype
703 * @param refLoc location of reference haplotype
704 * @param activeRegionWindow window of the active region (without padding)
705 * @param resultSet (can be null) the results set into which to deposit discovered haplotypes
706 * @param aligner SmithWaterman aligner to use for aligning the discovered haplotype to the reference haplotype
707 * @return A list of discovered haplotyes (note that this is not currently used for anything)
708 */
709 fn find_best_path<'b, A: AbstractReadThreadingGraph>(
710 &self,
711 // graph: &BaseGraph<V, E>,
712 assembly_result: &'b mut AssemblyResult<SimpleInterval, A>,
713 ref_haplotype: &'b Haplotype<SimpleInterval>,
714 _ref_loc: &'b SimpleInterval,
715 active_region_window: &'b SimpleInterval,
716 haplotype_to_reference_sw_parameters: &Parameters,
717 result_set: &mut AssemblyResultSet<A>,
718 avx_mode: AVXMode,
719 ) {
720 // add the reference haplotype separately from all the others to ensure
721 // that it is present in the list of haplotypes
722 // let mut return_haplotypes = LinkedHashSet::new();
723 let active_region_start = ref_haplotype.alignment_start_hap_wrt_ref;
724 let mut failed_cigars = 0;
725 {
726 // Validate that the graph is valid with extant source and sink before operating
727 let source = assembly_result
728 .graph
729 .as_ref()
730 .unwrap()
731 .base_graph
732 .get_reference_source_vertex();
733 let sink = assembly_result
734 .graph
735 .as_ref()
736 .unwrap()
737 .base_graph
738 .get_reference_sink_vertex();
739 assert!(
740 source.is_some() && sink.is_some(),
741 "Both source and sink cannot be null"
742 );
743
744 let k_best_haplotypes: Box<Vec<KBestHaplotype>> = if self.generate_seq_graph {
745 Box::new(
746 GraphBasedKBestHaplotypeFinder::new_from_singletons(
747 &mut assembly_result.graph.as_mut().unwrap().base_graph,
748 source.unwrap(),
749 sink.unwrap(),
750 )
751 .find_best_haplotypes(
752 self.num_best_haplotypes_per_graph as usize,
753 &assembly_result.graph.as_ref().unwrap().base_graph,
754 ),
755 )
756 } else {
757 // TODO: JunctionTreeKBestHaplotype looks munted and I haven't implemented the other
758 // JunctionTree stuff so skipping for now
759 panic!("JunctionTree not yet supported, please set generate_seq_graph to true")
760 };
761
762 for k_best_haplotype in k_best_haplotypes.into_iter() {
763 // TODO for now this seems like the solution, perhaps in the future it will be to excise the haplotype completely)
764 // TODO: Lorikeet note, some weird Java shit happens here, will need a work around when
765 // junction tree is implemented
766 let mut h =
767 k_best_haplotype.haplotype(&assembly_result.graph.as_ref().unwrap().base_graph);
768 h.kmer_size = k_best_haplotype.kmer_size;
769
770 if !result_set.haplotypes.contains(&h) {
771 // debug!(
772 // "Potential location {:?} potential haplotype {:?}",
773 // active_region_window, &h
774 // );
775 // TODO this score seems to be irrelevant at this point...
776 // if k_best_haplotype.is_reference {
777 // ref_haplotype.score = OrderedFloat(k_best_haplotype.score);
778 // };
779
780 // debug!("+++++++++==================================== Candidates ====================================+++++++++");
781 // debug!(
782 // "ref -> {}",
783 // std::str::from_utf8(ref_haplotype.get_bases()).unwrap()
784 // );
785 // debug!("alt -> {}", std::str::from_utf8(h.get_bases()).unwrap());
786 // debug!("+++++++++====================================++++++++++++====================================+++++++++");
787 let cigar = CigarUtils::calculate_cigar(
788 ref_haplotype.get_bases(),
789 h.get_bases(),
790 OverhangStrategy::SoftClip,
791 haplotype_to_reference_sw_parameters,
792 avx_mode,
793 );
794
795 match cigar {
796 None => {
797 failed_cigars += 1;
798 continue;
799 }
800 Some(cigar) => {
801 if cigar.is_empty() {
802 panic!(
803 "Smith-Waterman alignment failure. Cigar = {:?}, with reference \
804 length {} but expecting reference length of {}",
805 &cigar, CigarUtils::get_reference_length(&cigar),
806 CigarUtils::get_reference_length(&ref_haplotype.cigar)
807 )
808 } else if Self::path_is_too_divergent_from_reference(&cigar)
809 || CigarUtils::get_reference_length(&cigar)
810 < Self::MIN_HAPLOTYPE_REFERENCE_LENGTH
811 {
812 // N cigar elements means that a bubble was too divergent from the reference so skip over this path
813 continue;
814 } else if CigarUtils::get_reference_length(&cigar)
815 != CigarUtils::get_reference_length(&ref_haplotype.cigar)
816 {
817 // the SOFTCLIP strategy can produce a haplotype cigar that matches the beginning of the reference and
818 // skips the latter part of the reference. For example, when padded haplotype = NNNNNNNNNN[sequence 1]NNNNNNNNNN
819 // and padded ref = NNNNNNNNNN[sequence 1][sequence 2]NNNNNNNNNN, the alignment may choose to align only sequence 1.
820 // If aligning with an indel strategy produces a cigar with deletions for sequence 2 (which is reflected in the
821 // reference length of the cigar matching the reference length of the ref haplotype), then the assembly window was
822 // simply too small to reliably resolve the deletion; it should only throw an IllegalStateException when aligning
823 // with the INDEL strategy still produces discrepant reference lengths.
824 // You might wonder why not just use the INDEL strategy from the beginning. This is because the SOFTCLIP strategy only fails
825 // when there is insufficient flanking sequence to resolve the cigar unambiguously. The INDEL strategy would produce
826 // valid but most likely spurious indel cigars.
827 let cigar_with_indel_strategy = CigarUtils::calculate_cigar(
828 ref_haplotype.get_bases(),
829 h.get_bases(),
830 OverhangStrategy::InDel,
831 haplotype_to_reference_sw_parameters,
832 avx_mode,
833 );
834
835 match cigar_with_indel_strategy {
836 None => panic!("Smith-Waterman Alignment failure. No result"),
837 Some(cigar_with_indel_strategy) => {
838 if CigarUtils::get_reference_length(
839 &cigar_with_indel_strategy,
840 ) == CigarUtils::get_reference_length(
841 &ref_haplotype.cigar,
842 ) {
843 failed_cigars += 1;
844 continue;
845 } else {
846 panic!(
847 "Smith-Waterman alignment failure. Cigar = {:?} with \
848 reference length {} but expecting reference length of \
849 {} ref = {:?} path = {:?}", &cigar,
850 CigarUtils::get_reference_length(&cigar),
851 CigarUtils::get_reference_length(&ref_haplotype.cigar),
852 std::str::from_utf8(ref_haplotype.get_bases()),
853 std::str::from_utf8(h.get_bases()),
854 )
855 }
856 }
857 }
858 }
859
860 h.cigar = cigar;
861 h.alignment_start_hap_wrt_ref = active_region_start;
862 h.genome_location = Some(active_region_window.clone());
863 // debug!(
864 // "Adding haplotype {:?} from graph with kmer {}",
865 // &h.cigar,
866 // assembly_result
867 // .graph
868 // .as_ref()
869 // .unwrap()
870 // .base_graph
871 // .get_kmer_size()
872 // );
873 // return_haplotypes.insert(h.clone());
874 // result set would get added to here
875 // let mut result_set = result_set.lock().unwrap();
876 result_set.add_haplotype(h);
877 }
878 }
879 }
880 }
881 }
882
883 // Make sure that the ref haplotype is amongst the return haplotypes and calculate its score as
884 // the first returned by any finder.
885 // HERE we want to preserve the signal that assembly failed completely so in this case we don't add anything to the empty list
886 // if !result_set.haplotypes.is_empty() && !result_set.haplotypes.contains(ref_haplotype) {
887 // return_haplotypes.insert(ref_haplotype.clone());
888 // };
889
890 if failed_cigars != 0 {
891 // debug!(
892 // "Failed to align some haplotypes ({}) back to the reference (loc={:?}); \
893 // these will be ignored",
894 // failed_cigars, ref_loc
895 // )
896 }
897
898 // assembly_result.set_discovered_haplotypes(return_haplotypes);
899 }
900
901 /**
902 * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal
903 * @param c the cigar to test
904 * @return true if we should skip over this path
905 */
906 fn path_is_too_divergent_from_reference(c: &CigarString) -> bool {
907 return c.0.iter().any(|ce| match ce {
908 Cigar::RefSkip(_) => true,
909 _ => false,
910 });
911 }
912
913 /**
914 * Creates the sequence graph for the given kmerSize
915 *
916 * @param reads reads to use
917 * @param refHaplotype reference haplotype
918 * @param kmerSize kmer size
919 * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
920 * @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers
921 * @param aligner {@link SmithWatermanAligner} used to align dangling ends to the reference sequence
922 * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
923 */
924 fn create_graph<'b>(
925 &self,
926 reads: &'b Vec<BirdToolRead>,
927 ref_haplotype: &'b Haplotype<SimpleInterval>,
928 kmer_size: usize,
929 allow_low_complexity_graphs: bool,
930 _allow_non_unique_kmers_in_ref: bool,
931 sample_names: &'b [String],
932 dangling_end_sw_parameters: &Parameters,
933 avx_mode: AVXMode,
934 ) -> Option<AssemblyResult<SimpleInterval, ReadThreadingGraph>> {
935 if ref_haplotype.len() < kmer_size {
936 // happens in cases where the assembled region is just too small
937 return Some(AssemblyResult::new(Status::Failed, None, None));
938 }
939
940 if !self.allow_non_unique_kmers_in_ref
941 && !ReadThreadingGraph::determine_non_unique_kmers(
942 &SequenceForKmers::new(
943 "ref".to_string(),
944 ref_haplotype.get_bases(),
945 0,
946 ref_haplotype.get_bases().len(),
947 1,
948 true,
949 ),
950 kmer_size,
951 )
952 .is_empty()
953 {
954 // debug!("Not using kmer size of {kmer_size} in read threading assembler because reference contains non-unique kmers");
955 return None;
956 }
957
958 let mut rt_graph =
959 // if self.generate_seq_graph {
960 ReadThreadingGraph::new(
961 kmer_size,
962 false,
963 self.min_base_quality_to_use_in_assembly,
964 self.num_pruning_samples as usize,
965 self.min_matching_bases_to_dangling_end_recovery,
966 avx_mode
967 );
968 // } else {
969 // // This is where the junction tree debruijn graph would go but considering it is experimental
970 // // we will leave it out for now
971 // ReadThreadingGraph::new(
972 // kmer_size,
973 // false,
974 // self.min_base_quality_to_use_in_assembly,
975 // self.num_pruning_samples as usize,
976 // self.min_matching_bases_to_dangling_end_recovery,
977 // )
978 // };
979
980 rt_graph.set_threading_start_only_at_existing_vertex(!self.recover_dangling_branches);
981
982 // add the reference sequence to the graph
983 let mut pending = LinkedHashMap::new();
984 rt_graph.add_sequence(
985 &mut pending,
986 "ref".to_string(),
987 // ReadThreadingGraph::ANONYMOUS_SAMPLE,
988 std::usize::MAX,
989 ref_haplotype.get_bases(),
990 0,
991 ref_haplotype.get_bases().len(),
992 1,
993 true,
994 );
995 // debug!(
996 // "1 - Graph Kmer {} Edges {} Nodes {}",
997 // kmer_size,
998 // rt_graph.base_graph.graph.edge_count(),
999 // rt_graph.base_graph.graph.node_count()
1000 // );
1001
1002 // Next pull kmers out of every read and throw them on the graph
1003 // debug!("1.5 - Reads {}", reads.len());
1004 let mut count = 0;
1005
1006 let mut sample_count = LinkedHashMap::new();
1007 // let mut read_debugging = false;
1008 for read in reads {
1009 let s_count = sample_count.entry(read.sample_index).or_insert(0);
1010 *s_count += 1;
1011 // if read.name() == b"DFDW01000005.1-5" {
1012 // // debug!("Read {:?}", read);
1013 // read_debugging = true;
1014 // };
1015 rt_graph.add_read(read, sample_names, &mut count, &mut pending)
1016 }
1017 // debug!("1.5 - Count {} -> {:?}", count, sample_count);
1018 // let pending = rt_graph.get_pending(); // retrieve pending sequences and clear pending from graph
1019 // actually build the read threading graph
1020 rt_graph.build_graph_if_necessary(&mut pending);
1021 // debug!(
1022 // "2 - Graph Kmer {} Edges {} Nodes {}",
1023 // kmer_size,
1024 // rt_graph.base_graph.graph.edge_count(),
1025 // rt_graph.base_graph.graph.node_count()
1026 // );
1027
1028 if self.debug_graph_transformations {
1029 self.print_debug_graph_transform_abstract(
1030 &rt_graph,
1031 format!(
1032 "{}_{}-{}-sequenceGraph.{}.0.0.raw_threading_graph.dot",
1033 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1034 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1035 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1036 kmer_size
1037 ),
1038 )
1039 }
1040
1041 // It's important to prune before recovering dangling ends so that we don't waste time recovering bad ends.
1042 // It's also important to prune before checking for cycles so that sequencing errors don't create false cycles
1043 // and unnecessarily abort assembly
1044 if self.prune_before_cycle_counting {
1045 self.chain_pruner
1046 .prune_low_weight_chains(rt_graph.get_base_graph_mut());
1047 }
1048 // debug!(
1049 // "3 - Graph Kmer {} Edges {} Nodes {}",
1050 // kmer_size,
1051 // rt_graph.base_graph.graph.edge_count(),
1052 // rt_graph.base_graph.graph.node_count()
1053 // );
1054
1055 // sanity check: make sure there are no cycles in the graph, unless we are in experimental mode
1056 if self.generate_seq_graph && rt_graph.has_cycles() {
1057 // debug!(
1058 // "Not using kmer size of {} in read threading assembler \
1059 // because it contains a cycle",
1060 // kmer_size
1061 // );
1062 return None;
1063 }
1064
1065 // sanity check: make sure the graph had enough complexity with the given kmer
1066 if !allow_low_complexity_graphs && rt_graph.is_low_quality_graph() {
1067 // debug!(
1068 // "Not using kmer size of {} in read threading assembler because it does not \
1069 // produce a graph with enough complexity",
1070 // kmer_size
1071 // );
1072 return None;
1073 }
1074
1075 let result = self.get_assembly_result(
1076 ref_haplotype,
1077 kmer_size,
1078 rt_graph,
1079 dangling_end_sw_parameters,
1080 );
1081 // check whether recovering dangling ends created cycles
1082 if self.recover_all_dangling_branches
1083 && result.threading_graph.as_ref().unwrap().has_cycles()
1084 {
1085 return None;
1086 }
1087
1088 return Some(result);
1089 }
1090
1091 fn get_assembly_result<A: AbstractReadThreadingGraph>(
1092 &self,
1093 ref_haplotype: &Haplotype<SimpleInterval>,
1094 kmer_size: usize,
1095 mut rt_graph: A,
1096 dangling_end_sw_parameters: &Parameters,
1097 ) -> AssemblyResult<SimpleInterval, A> {
1098 if !self.prune_before_cycle_counting {
1099 self.chain_pruner
1100 .prune_low_weight_chains(rt_graph.get_base_graph_mut())
1101 }
1102
1103 if self.debug_graph_transformations {
1104 self.print_debug_graph_transform_abstract(
1105 &rt_graph,
1106 format!(
1107 "{}_{}-{}-sequenceGraph.{}.0.1.chain_pruned_readthreading_graph.dot",
1108 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1109 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1110 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1111 kmer_size
1112 ),
1113 );
1114 };
1115
1116 // look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if
1117 // we can recover them by merging some N bases from the chain back into the reference
1118 if self.recover_dangling_branches {
1119 rt_graph.recover_dangling_tails(
1120 self.prune_factor as usize,
1121 self.min_dangling_branch_length,
1122 self.recover_all_dangling_branches,
1123 dangling_end_sw_parameters,
1124 );
1125 rt_graph.recover_dangling_heads(
1126 self.prune_factor as usize,
1127 self.min_dangling_branch_length,
1128 self.recover_all_dangling_branches,
1129 dangling_end_sw_parameters,
1130 );
1131 }
1132
1133 // remove all heading and trailing paths
1134 if self.remove_paths_not_connected_to_ref {
1135 rt_graph.remove_paths_not_connected_to_ref()
1136 }
1137
1138 if self.debug_graph_transformations {
1139 self.print_debug_graph_transform_abstract(
1140 &rt_graph,
1141 format!(
1142 "{}_{}-{}-sequenceGraph.{}.0.2.cleaned_readthreading_graph.dot",
1143 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1144 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1145 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1146 kmer_size
1147 ),
1148 );
1149 };
1150
1151 // Either return an assembly result with a sequence graph or with an unchanged
1152 // sequence graph deptending on the kmer duplication behavior
1153 if self.generate_seq_graph {
1154 let mut initial_seq_graph = rt_graph.to_sequence_graph();
1155
1156 if self.debug_graph_transformations {
1157 rt_graph.print_graph(
1158 format!(
1159 "{}_{}-{}-sequenceGraph.{}.0.3.initial_seqgraph.dot",
1160 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1161 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1162 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1163 kmer_size
1164 ),
1165 10000,
1166 );
1167 };
1168
1169 // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
1170 if self.just_return_raw_graph {
1171 return AssemblyResult::new(
1172 Status::AssembledSomeVariation,
1173 Some(initial_seq_graph),
1174 None,
1175 );
1176 }
1177
1178 // debug!(
1179 // "Using kmer size of {} in read threading assembler",
1180 // &rt_graph.get_kmer_size()
1181 // );
1182
1183 if self.debug_graph_transformations {
1184 self.print_debug_graph_transform_abstract(
1185 &rt_graph,
1186 format!(
1187 "{}_{}-{}-sequenceGraph.{}.0.4.initial_seqgraph.dot",
1188 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1189 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1190 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1191 kmer_size
1192 ),
1193 );
1194 };
1195
1196 initial_seq_graph.base_graph.clean_non_ref_paths();
1197 let cleaned: AssemblyResult<SimpleInterval, ReadThreadingGraph> =
1198 self.clean_up_seq_graph(initial_seq_graph, &ref_haplotype);
1199 let status = cleaned.status;
1200 return AssemblyResult::new(status, cleaned.graph, Some(rt_graph));
1201 } else {
1202 // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
1203 if self.just_return_raw_graph {
1204 return AssemblyResult::new(Status::AssembledSomeVariation, None, Some(rt_graph));
1205 }
1206
1207 // debug!(
1208 // "Using kmer size of {} in read threading assembler",
1209 // &rt_graph.get_kmer_size()
1210 // );
1211 let cleaned = Self::get_result_set_for_rt_graph(rt_graph);
1212 return cleaned;
1213 }
1214 }
1215
1216 fn get_result_set_for_rt_graph<A: AbstractReadThreadingGraph>(
1217 rt_graph: A,
1218 ) -> AssemblyResult<SimpleInterval, A> {
1219 // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
1220 // happen in cases where for example the reference somehow manages to acquire a cycle, or
1221 // where the entire assembly collapses back into the reference sequence.
1222 if rt_graph.get_reference_source_vertex().is_none()
1223 || rt_graph.get_reference_sink_vertex().is_none()
1224 {
1225 return AssemblyResult::new(Status::JustAssembledReference, None, Some(rt_graph));
1226 };
1227
1228 return AssemblyResult::new(Status::AssembledSomeVariation, None, Some(rt_graph));
1229 }
1230
1231 // Performs the various transformations necessary on a sequence graph
1232 fn clean_up_seq_graph<A: AbstractReadThreadingGraph>(
1233 &self,
1234 mut seq_graph: SeqGraph<BaseEdgeStruct>,
1235 ref_haplotype: &Haplotype<SimpleInterval>,
1236 ) -> AssemblyResult<SimpleInterval, A> {
1237 if self.debug_graph_transformations {
1238 self.print_debug_graph_transform_seq_graph(
1239 &seq_graph,
1240 format!(
1241 "{}_{}-{}-sequenceGraph.{}.1.0.non_ref_removed.dot",
1242 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1243 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1244 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1245 seq_graph.base_graph.get_kmer_size()
1246 ),
1247 );
1248 };
1249
1250 // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
1251 seq_graph.zip_linear_chains();
1252 if self.debug_graph_transformations {
1253 self.print_debug_graph_transform_seq_graph(
1254 &seq_graph,
1255 format!(
1256 "{}_{}-{}-sequenceGraph.{}.1.1.zipped.dot",
1257 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1258 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1259 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1260 seq_graph.base_graph.get_kmer_size()
1261 ),
1262 );
1263 };
1264
1265 // now go through and prune the graph, removing vertices no longer connected to the reference chain
1266 seq_graph.base_graph.remove_singleton_orphan_vertices();
1267 seq_graph
1268 .base_graph
1269 .remove_vertices_not_connected_to_ref_regardless_of_edge_direction();
1270
1271 if self.debug_graph_transformations {
1272 self.print_debug_graph_transform_seq_graph(
1273 &seq_graph,
1274 format!(
1275 "{}_{}-{}-sequenceGraph.{}.1.2.pruned.dot",
1276 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1277 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1278 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1279 seq_graph.base_graph.get_kmer_size()
1280 ),
1281 );
1282 };
1283
1284 seq_graph.simplify_graph(&format!(
1285 "{}_{}-{}.0",
1286 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1287 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1288 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1289 ));
1290 if self.debug_graph_transformations {
1291 self.print_debug_graph_transform_seq_graph(
1292 &seq_graph,
1293 format!(
1294 "{}_{}-{}-sequenceGraph.{}.1.3.merged.dot",
1295 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1296 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1297 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1298 seq_graph.base_graph.get_kmer_size()
1299 ),
1300 );
1301 };
1302
1303 // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
1304 // happen in cases where for example the reference somehow manages to acquire a cycle, or
1305 // where the entire assembly collapses back into the reference sequence.
1306 if seq_graph.base_graph.get_reference_source_vertex().is_none()
1307 || seq_graph.base_graph.get_reference_sink_vertex().is_none()
1308 {
1309 return AssemblyResult::new(Status::JustAssembledReference, Some(seq_graph), None);
1310 };
1311
1312 seq_graph.base_graph.remove_paths_not_connected_to_ref();
1313 seq_graph.simplify_graph(&format!(
1314 "{}_{}-{}.1",
1315 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1316 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1317 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1318 ));
1319 if seq_graph.base_graph.graph.node_indices().count() == 1 {
1320 // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop
1321 // the code from blowing up.
1322 // TODO -- ref properties should really be on the vertices, not the graph itself
1323 let complete = seq_graph.base_graph.graph.node_indices().next().unwrap();
1324 let dummy = SeqVertex::new(Vec::new());
1325 let dummy_index = seq_graph.base_graph.add_node(&dummy);
1326 seq_graph.base_graph.graph.add_edge(
1327 complete,
1328 dummy_index,
1329 BaseEdgeStruct::new(true, 0, 0),
1330 );
1331 };
1332
1333 if self.debug_graph_transformations {
1334 self.print_debug_graph_transform_seq_graph(
1335 &seq_graph,
1336 format!(
1337 "{}_{}-{}-sequenceGraph.{}.1.4.final.dot",
1338 ref_haplotype.genome_location.as_ref().unwrap().tid(),
1339 ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1340 ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1341 seq_graph.base_graph.get_kmer_size(),
1342 ),
1343 );
1344 };
1345
1346 return AssemblyResult::new(Status::AssembledSomeVariation, Some(seq_graph), None);
1347 }
1348
1349 // fn print_seq_graphs(&self, graphs: &Vec<SeqGraph<BaseEdgeStruct>>) {
1350 // let _write_first_graph_with_size_smaller_than = 50;
1351
1352 // for (idx, graph) in graphs.iter().enumerate() {
1353 // graph.base_graph.print_graph(
1354 // &(self.graph_output_path.as_ref().unwrap().to_string() + &idx.to_string()),
1355 // false,
1356 // self.prune_factor as usize,
1357 // )
1358 // }
1359 // }
1360}