lorikeet_genome/read_threading/
read_threading_assembler.rs

1use gkl::smithwaterman::{OverhangStrategy, Parameters};
2use hashlink::LinkedHashMap;
3use rayon::prelude::*;
4use rust_htslib::bam::record::{Cigar, CigarString};
5
6// use crate::read_error_corrector::nearby_kmer_error_corrector::NearbyKmerErrorCorrector;
7use crate::assembly::assembly_region::AssemblyRegion;
8use crate::assembly::assembly_result::{AssemblyResult, Status};
9use crate::assembly::assembly_result_set::AssemblyResultSet;
10use crate::graphs::adaptive_chain_pruner::AdaptiveChainPruner;
11use crate::graphs::base_edge::{BaseEdge, BaseEdgeStruct};
12use crate::graphs::base_graph::BaseGraph;
13use crate::graphs::base_vertex::BaseVertex;
14use crate::graphs::chain_pruner::ChainPruner;
15use crate::graphs::graph_based_k_best_haplotype_finder::GraphBasedKBestHaplotypeFinder;
16use crate::graphs::k_best_haplotype::KBestHaplotype;
17use crate::graphs::seq_graph::SeqGraph;
18use crate::graphs::seq_vertex::SeqVertex;
19use crate::haplotype::haplotype::Haplotype;
20use crate::model::byte_array_allele::Allele;
21use crate::pair_hmm::pair_hmm_likelihood_calculation_engine::AVXMode;
22use crate::graphs::low_weight_chain_pruner::LowWeightChainPruner;
23use crate::read_threading::abstract_read_threading_graph::{AbstractReadThreadingGraph, SequenceForKmers};
24use crate::read_threading::read_threading_graph::ReadThreadingGraph;
25use crate::reads::bird_tool_reads::BirdToolRead;
26use crate::reads::cigar_utils::CigarUtils;
27use crate::reads::read_clipper::ReadClipper;
28use crate::utils::simple_interval::{Locatable, SimpleInterval};
29
30const PRUNE_FACTOR_COVERAGE_THRESHOLD: f64 = 10.0;
31
32#[derive(Debug, Clone)]
33pub struct ReadThreadingAssembler {
34    pub(crate) kmer_sizes: Vec<usize>,
35    dont_increase_kmer_sizes_for_cycles: bool,
36    allow_non_unique_kmers_in_ref: bool,
37    generate_seq_graph: bool,
38    // recover_haplotypes_from_edges_not_covered_in_junction_trees: bool,
39    num_pruning_samples: i32,
40    disable_prune_factor_correction: bool, // if the region has many reads, having a low prune factor can cause excessive runtimes
41    num_best_haplotypes_per_graph: i32,
42    prune_before_cycle_counting: bool,
43    remove_paths_not_connected_to_ref: bool,
44    just_return_raw_graph: bool,
45    pub(crate) recover_dangling_branches: bool,
46    pub(crate) recover_all_dangling_branches: bool,
47    pub(crate) min_dangling_branch_length: i32,
48    pub(crate) min_base_quality_to_use_in_assembly: u8,
49    prune_factor: usize,
50    min_matching_bases_to_dangling_end_recovery: i32,
51    chain_pruner: ChainPruner,
52    pub(crate) debug_graph_transformations: bool,
53    pub(crate) debug_graph_output_path: Option<String>,
54    // graph_haplotype_histogram_path: Option<String>,
55    pub(crate) graph_output_path: Option<String>,
56}
57
58impl ReadThreadingAssembler {
59    const DEFAULT_NUM_PATHS_PER_GRAPH: usize = 128;
60    const KMER_SIZE_ITERATION_INCREASE: usize = 13;
61    const MAX_KMER_ITERATIONS_TO_ATTEMPT: usize = 6;
62
63    /**
64     * If false, we will only write out a region around the reference source
65     */
66    // const PRINT_FILL_GRAPH_FOR_DEBUGGING: bool = true;
67    const DEFAULT_MIN_BASE_QUALITY_TO_USE: u8 = 10;
68    const MIN_HAPLOTYPE_REFERENCE_LENGTH: u32 = 30;
69
70    pub fn new(
71        max_allowed_paths_for_read_threading_assembler: i32,
72        mut kmer_sizes: Vec<usize>,
73        dont_increase_kmer_sizes_for_cycles: bool,
74        allow_non_unique_kmers_in_ref: bool,
75        num_pruning_samples: i32,
76        prune_factor: usize,
77        use_adaptive_pruning: bool,
78        initial_error_rate_for_pruning: f64,
79        pruning_log_odds_threshold: f64,
80        pruning_seeding_log_odds_threshold: f64,
81        max_unpruned_variants: usize,
82        _use_linked_debruijn_graphs: bool,
83        enable_legacy_graph_cycle_detection: bool,
84        min_matching_bases_to_dangle_end_recovery: i32,
85        disable_prune_factor_correction: bool,
86    ) -> ReadThreadingAssembler {
87        assert!(
88            max_allowed_paths_for_read_threading_assembler >= 1,
89            "num_best_haplotypes_per_graph should be >= 1 but got {}",
90            max_allowed_paths_for_read_threading_assembler
91        );
92        kmer_sizes.sort_unstable();
93
94        let chain_pruner = if use_adaptive_pruning {
95            ChainPruner::AdaptiveChainPruner(AdaptiveChainPruner::new(
96                initial_error_rate_for_pruning,
97                pruning_log_odds_threshold,
98                pruning_seeding_log_odds_threshold,
99                max_unpruned_variants,
100            ))
101        } else {
102            ChainPruner::LowWeightChainPruner(LowWeightChainPruner::new(prune_factor))
103        };
104
105        // TODO: //!use_linked_debruijn_graphs should be used for generate_seq_graph
106        //      but have not yet implement junction tree method
107        ReadThreadingAssembler {
108            kmer_sizes,
109            dont_increase_kmer_sizes_for_cycles,
110            allow_non_unique_kmers_in_ref,
111            num_pruning_samples,
112            prune_factor,
113            chain_pruner,
114            generate_seq_graph: true,
115            prune_before_cycle_counting: !enable_legacy_graph_cycle_detection,
116            remove_paths_not_connected_to_ref: true,
117            just_return_raw_graph: false,
118            recover_dangling_branches: true,
119            recover_all_dangling_branches: false,
120            min_dangling_branch_length: 0,
121            num_best_haplotypes_per_graph: max_allowed_paths_for_read_threading_assembler,
122            min_matching_bases_to_dangling_end_recovery: min_matching_bases_to_dangle_end_recovery,
123            // recover_haplotypes_from_edges_not_covered_in_junction_trees: true,
124            min_base_quality_to_use_in_assembly: Self::DEFAULT_MIN_BASE_QUALITY_TO_USE,
125            debug_graph_transformations: false,
126            debug_graph_output_path: Some(format!("graph_debugging")),
127            // graph_haplotype_histogram_path: None,
128            graph_output_path: None,
129            disable_prune_factor_correction
130        }
131    }
132
133    pub fn default() -> Self {
134        Self::new(
135            Self::DEFAULT_NUM_PATHS_PER_GRAPH as i32,
136            vec![25],
137            true,
138            true,
139            1,
140            2,
141            false,
142            0.001,
143            2.0,
144            2.0,
145            std::usize::MAX,
146            false,
147            false,
148            3,
149            false,
150        )
151    }
152
153    pub fn default_with_kmers(
154        max_allowed_paths_for_read_threading_assembler: i32,
155        kmer_sizes: Vec<usize>,
156        prune_factor: usize,
157    ) -> Self {
158        Self::new(
159            max_allowed_paths_for_read_threading_assembler,
160            kmer_sizes,
161            true,
162            true,
163            1,
164            prune_factor,
165            false,
166            0.001,
167            2.0,
168            2.0,
169            std::usize::MAX,
170            false,
171            false,
172            3,
173            false
174        )
175    }
176
177    pub fn set_just_return_raw_graph(&mut self, value: bool) {
178        self.just_return_raw_graph = value;
179    }
180
181    pub fn set_remove_paths_not_connected_to_ref(&mut self, value: bool) {
182        self.remove_paths_not_connected_to_ref = value;
183    }
184
185    pub fn set_recover_dangling_branches(&mut self, value: bool) {
186        self.recover_dangling_branches = value;
187    }
188
189    fn set_prune_factor(&mut self, value: usize) {
190        self.prune_factor = value;
191        self.chain_pruner.set_prune_factor(value);
192    }
193    /**
194     * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
195     * @param assemblyRegion              AssemblyRegion object holding the reads which are to be used during assembly
196     * @param refHaplotype              reference haplotype object
197     * @param fullReferenceWithPadding  byte array holding the reference sequence with padding
198     * @param refLoc                    GenomeLoc object corresponding to the reference sequence with padding
199     * @param readErrorCorrector        a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used.
200     * @param aligner                   {@link SmithWatermanAligner} used to align dangling ends in assembly graphs to the reference sequence
201     * @return                          the resulting assembly-result-set
202     */
203    pub fn run_local_assembly<'b>(
204        &mut self,
205        mut assembly_region: AssemblyRegion,
206        ref_haplotype: &'b mut Haplotype<SimpleInterval>,
207        full_reference_with_padding: Vec<u8>,
208        ref_loc: SimpleInterval,
209        // read_error_corrector: Option<C>,
210        sample_names: &'b [String],
211        dangling_end_sw_parameters: Parameters,
212        reference_to_haplotype_sw_parameters: Parameters,
213        avx_mode: AVXMode,
214        additional_kmer_sizes: Option<Vec<usize>>
215    ) -> AssemblyResultSet<ReadThreadingGraph> {
216        assert!(
217            full_reference_with_padding.len() == ref_loc.size(),
218            "Reference bases and reference loc must be the same size. {} -> {}",
219            full_reference_with_padding.len(),
220            ref_loc.size()
221        );
222
223        // Note that error correction does not modify the original reads,
224        // which are used for genotyping TODO this might come before error correction /
225        // let mut corrected_reads = assembly_region.get_reads_cloned();
226        // match read_error_corrector {
227        //     // TODO: Is it possible to perform this
228        //     //      without cloning? Perhaps get_reads() should just return ownership of reads?
229        //     //      Can't move reads out of assembly region as they are required later on during
230        //     //      read threading phase. Very annoying
231        //     None => assembly_region.get_reads_cloned(),
232        //     Some(mut read_error_corrector) => {
233        //         read_error_corrector.correct_reads(assembly_region.get_reads_cloned())
234        //     }
235        // };
236
237        // Revert clipped bases if necessary (since we do not want to assemble them)
238        let corrected_reads = assembly_region.move_reads();
239        let corrected_reads = corrected_reads
240            .into_par_iter()
241            .map(|read| ReadClipper::new(read).hard_clip_soft_clipped_bases())
242            .collect::<Vec<BirdToolRead>>();
243
244        // calculate coverage estimate. no. reads / region size
245        let old_prune_factor = self.prune_factor;
246        if !self.disable_prune_factor_correction && !self.chain_pruner.is_adaptive() {
247            let coverage = assembly_region.calculate_coverage(&corrected_reads);
248            // debug!("Coverage {} read count {} region size {}", coverage, corrected_reads.len(), assembly_region.get_span().size());
249            let new_prune_factor = if coverage > PRUNE_FACTOR_COVERAGE_THRESHOLD {
250                2
251            } else {
252                0
253            };
254            self.set_prune_factor(new_prune_factor);
255        }
256
257
258        // debug!("Corrected reads {}", corrected_reads.len());
259        // let non_ref_rt_graphs: Vec<ReadThreadingGraph> = Vec::new();
260        // let non_ref_seq_graphs: Vec<SeqGraph<BaseEdgeStruct>> = Vec::new();
261        let active_region_extended_location = assembly_region.get_padded_span();
262        ref_haplotype.set_genome_location(active_region_extended_location.clone());
263
264        let mut result_set = AssemblyResultSet::new(
265            assembly_region,
266            full_reference_with_padding,
267            ref_loc.clone(),
268            ref_haplotype.clone(),
269        );
270
271        // either follow the old method for building graphs and then assembling or assemble and haplotype call before expanding kmers
272        if self.generate_seq_graph {
273            self.assemble_kmer_graphs_and_haplotype_call(
274                &ref_haplotype,
275                &ref_loc,
276                &corrected_reads,
277                &mut result_set,
278                &active_region_extended_location,
279                sample_names,
280                &dangling_end_sw_parameters,
281                &reference_to_haplotype_sw_parameters,
282                avx_mode,
283                additional_kmer_sizes
284            );
285        } else {
286            self.assemble_graphs_and_expand_kmers_given_haplotypes(
287                &ref_haplotype,
288                &ref_loc,
289                &corrected_reads,
290                &mut result_set,
291                &active_region_extended_location,
292                sample_names,
293                &dangling_end_sw_parameters,
294                &reference_to_haplotype_sw_parameters,
295                avx_mode,
296                additional_kmer_sizes
297            )
298        }
299
300        // reset prune_factor
301        self.set_prune_factor(old_prune_factor);
302
303        // If we get to this point then no graph worked... thats bad and indicates something
304        // horrible happened, in this case we just return a reference haplotype
305        result_set.region_for_genotyping.reads = corrected_reads;
306        // debug!(
307        //     "Found {} to compare every read against",
308        //     result_set.haplotypes.len()
309        // );
310        result_set
311    }
312
313    /**
314     * Follow the old behavior, call into {@link #assemble(List, Haplotype, SAMFileHeader, SmithWatermanAligner)} to decide if a graph
315     * is acceptable for haplotype discovery then detect haplotypes.
316     */
317    fn assemble_kmer_graphs_and_haplotype_call<'b>(
318        &mut self,
319        ref_haplotype: &'b Haplotype<SimpleInterval>,
320        ref_loc: &'b SimpleInterval,
321        corrected_reads: &'b Vec<BirdToolRead>,
322        // non_ref_seq_graphs: &mut Vec<SeqGraph<BaseEdgeStruct>>,
323        result_set: &mut AssemblyResultSet<ReadThreadingGraph>,
324        active_region_extended_location: &'b SimpleInterval,
325        sample_names: &'b [String],
326        dangling_end_sw_parameters: &Parameters,
327        reference_to_haplotype_sw_parameters: &Parameters,
328        avx_mode: AVXMode,
329        additional_kmer_sizes: Option<Vec<usize>>
330    ) {
331        // create the graphs by calling our subclass assemble method
332        self.assemble(
333            &corrected_reads,
334            ref_haplotype,
335            sample_names,
336            dangling_end_sw_parameters,
337            avx_mode,
338            additional_kmer_sizes
339        )
340        .into_iter()
341        .for_each(|mut result| {
342            // debug!("graph after assembly {:?}", &result.graph.as_ref().unwrap().base_graph);
343            // debug!(
344            //     "Result loc {:?} Status {:?} haps {:?}",
345            //     active_region_extended_location, &result.status, &result.discovered_haplotypes
346            // );
347
348            if result.status == Status::AssembledSomeVariation {
349                // do some QC on the graph
350                Self::sanity_check_graph(&result.graph.as_ref().unwrap().base_graph, ref_haplotype);
351                // add it to graphs with meaningful non-reference features
352                self.find_best_path(
353                    &mut result,
354                    ref_haplotype,
355                    ref_loc,
356                    active_region_extended_location,
357                    reference_to_haplotype_sw_parameters,
358                    result_set,
359                    avx_mode,
360                );
361                // non_ref_seq_graphs.push(result.graph.unwrap());
362                // result_set.add_haplotype(result);
363            }
364        });
365    }
366
367    /**
368     * Given reads and a reference haplotype give us graphs to use for constructing
369     * non-reference haplotypes.
370     *
371     * @param reads the reads we're going to assemble
372     * @param refHaplotype the reference haplotype
373     * @param aligner {@link SmithWatermanAligner} used to align dangling ends in assembly graphs to the reference sequence
374     * @return a non-null list of reads
375     */
376    pub fn assemble<'b>(
377        &mut self,
378        reads: &'b Vec<BirdToolRead>,
379        ref_haplotype: &'b Haplotype<SimpleInterval>,
380        sample_names: &'b [String],
381        dangling_end_sw_parameters: &Parameters,
382        avx_mode: AVXMode,
383        additional_kmer_sizes: Option<Vec<usize>>,
384    ) -> Vec<AssemblyResult<SimpleInterval, ReadThreadingGraph>> {
385
386        let mut kmer_sizes = self.kmer_sizes.clone();
387        if let Some(additional) = additional_kmer_sizes {
388            kmer_sizes.extend(additional);
389        }
390
391        // try using the requested kmer sizes
392        let mut results = kmer_sizes
393            .par_iter()
394            .filter_map(|kmer_size| {
395                self.create_graph(
396                    reads,
397                    ref_haplotype,
398                    *kmer_size,
399                    self.dont_increase_kmer_sizes_for_cycles,
400                    self.allow_non_unique_kmers_in_ref,
401                    sample_names,
402                    dangling_end_sw_parameters,
403                    avx_mode,
404                )
405                // {
406                //     None => continue,
407                //     Some(assembly_result) => {
408                //         debug!(
409                //             "Found assembly result (No increase) graph -> {:?}",
410                //             assembly_result.graph.as_ref().unwrap().base_graph
411                //         );
412                //         results.push(assembly_result)
413                //     }
414                // }
415            })
416            .collect::<Vec<AssemblyResult<SimpleInterval, ReadThreadingGraph>>>();
417        
418
419        if results.is_empty() && !self.dont_increase_kmer_sizes_for_cycles {
420            let mut kmer_size =
421                *self.kmer_sizes.iter().max().unwrap() + Self::KMER_SIZE_ITERATION_INCREASE;
422            // if kmer_size is even, add 1 to make it odd
423            if kmer_size % 2 == 0 {
424                kmer_size += 1;
425            }
426            let mut num_iterations = 1;
427            while results.is_empty() && num_iterations <= Self::MAX_KMER_ITERATIONS_TO_ATTEMPT {
428                // on the last attempt we will allow low complexity graphs
429                let last_attempt = num_iterations == Self::MAX_KMER_ITERATIONS_TO_ATTEMPT;
430                match self.create_graph(
431                    reads,
432                    &ref_haplotype,
433                    kmer_size,
434                    last_attempt,
435                    last_attempt,
436                    sample_names,
437                    dangling_end_sw_parameters,
438                    avx_mode,
439                ) {
440                    None => {
441                        // pass
442                    }
443                    Some(assembly_result) => {
444                        results.push(assembly_result);
445                    }
446                };
447                kmer_size += Self::KMER_SIZE_ITERATION_INCREASE;
448                num_iterations += 1;
449            }
450        }
451
452        return results;
453    }
454
455    /**
456     * Follow the kmer expansion heurisics as {@link #assemble(List, Haplotype, SAMFileHeader, SmithWatermanAligner)}, but in this case
457     * attempt to recover haplotypes from the kmer graph and use them to assess whether to expand the kmer size.
458     */
459    fn assemble_graphs_and_expand_kmers_given_haplotypes<'b>(
460        &mut self,
461        ref_haplotype: &'b Haplotype<SimpleInterval>,
462        ref_loc: &'b SimpleInterval,
463        corrected_reads: &'b Vec<BirdToolRead>,
464        result_set: &mut AssemblyResultSet<ReadThreadingGraph>,
465        active_region_extended_location: &'b SimpleInterval,
466        sample_names: &'b [String],
467        dangling_end_sw_parameters: &Parameters,
468        reference_to_haplotype_sw_parameters: &Parameters,
469        avx_mode: AVXMode,
470        additional_kmer_sizes: Option<Vec<usize>>
471    ) {
472        let mut saved_assembly_results = Vec::new();
473
474        let mut has_adequately_assembled_graph = false;
475        let kmers_to_try = self.get_expanded_kmer_list(additional_kmer_sizes);
476        // first, try using the requested kmer sizes
477        for i in 0..kmers_to_try.len() {
478            let kmer_size = kmers_to_try[i];
479            let is_last_cycle = i == kmers_to_try.len() - 1;
480            if !has_adequately_assembled_graph {
481                let assembled_result = self.create_graph(
482                    corrected_reads,
483                    &ref_haplotype,
484                    kmer_size,
485                    is_last_cycle || self.dont_increase_kmer_sizes_for_cycles,
486                    is_last_cycle || self.allow_non_unique_kmers_in_ref,
487                    &sample_names,
488                    dangling_end_sw_parameters,
489                    avx_mode,
490                );
491                match assembled_result {
492                    None => {} //pass
493                    Some(mut assembled_result) => {
494                        if assembled_result.status == Status::AssembledSomeVariation {
495                            // do some QC on the graph
496                            Self::sanity_check_graph(
497                                assembled_result
498                                    .threading_graph
499                                    .as_ref()
500                                    .unwrap()
501                                    .get_base_graph(),
502                                &ref_haplotype,
503                            );
504                            let _ = &mut assembled_result
505                                .threading_graph
506                                .as_mut()
507                                .unwrap()
508                                .post_process_for_haplotype_finding(
509                                    self.debug_graph_output_path.as_ref(),
510                                    ref_haplotype.genome_location.as_ref().unwrap(),
511                                );
512                            // add it to graphs with meaningful non-reference features
513                            // non_ref_rt_graphs.push(assembled_result.threading_graph.unwrap().clone());
514                            // if graph
515                            // TODO: Add histogram plotting
516                            // let graph = assembled_result.threading_graph.as_ref().unwrap();
517                            self.find_best_path(
518                                &mut assembled_result,
519                                ref_haplotype,
520                                ref_loc,
521                                active_region_extended_location,
522                                reference_to_haplotype_sw_parameters,
523                                result_set,
524                                avx_mode,
525                            );
526
527                            saved_assembly_results.push(assembled_result);
528                            //TODO LOGIC PLAN HERE - we want to check if we have a trustworthy graph (i.e. no badly assembled haplotypes) if we do, emit it.
529                            //TODO                 - but if we failed to assemble due to excessive looping or did have badly assembled haplotypes then we expand kmer size.
530                            //TODO                 - If we get no variation
531
532                            // if asssembly didn't fail ( which is a degenerate case that occurs for some subset of graphs with difficult loop
533                            if !saved_assembly_results
534                                .last()
535                                .unwrap()
536                                .discovered_haplotypes
537                                .is_empty()
538                            {
539                                // we have found our workable kmer size so lets add the results and finish
540                                let assembled_result = saved_assembly_results.last().unwrap();
541                                if !assembled_result.contains_suspect_haploptypes {
542                                    // let mut result_set = result_set.lock().unwrap();
543                                    for h in assembled_result.discovered_haplotypes.clone() {
544                                        result_set.add_haplotype(h);
545                                    }
546
547                                    has_adequately_assembled_graph = true;
548                                }
549                            }
550                        } else if assembled_result.status == Status::JustAssembledReference {
551                            has_adequately_assembled_graph = true;
552                        }
553                    }
554                }
555            }
556        }
557
558        // This indicates that we have thrown everything away... we should go back and
559        // check that we weren't too conservative about assembly results that might
560        // otherwise be good
561        if !has_adequately_assembled_graph {
562            // search for the last haplotype set that had any results, if none are found just return me
563            // In this case we prefer the last meaningful kmer size if possible
564            // for result in saved_assembly_results.
565            saved_assembly_results.reverse();
566            for result in saved_assembly_results {
567                if result.discovered_haplotypes.len() > 1 {
568                    // let mut result_set = result_set.lock().unwrap();
569                    let ar_index = result_set.add_assembly_result(result);
570                    for h in result_set.assembly_results[ar_index]
571                        .discovered_haplotypes
572                        .clone()
573                    {
574                        result_set.add_haplotype_and_assembly_result(h, ar_index);
575                    }
576                    break;
577                }
578            }
579        }
580    }
581
582    /**
583     * Make sure the reference sequence is properly represented in the provided graph
584     *
585     * @param graph the graph to check
586     * @param refHaplotype the reference haplotype
587     */
588    fn sanity_check_graph<'b, V: BaseVertex, E: BaseEdge>(
589        graph: &'b BaseGraph<V, E>,
590        ref_haplotype: &'b Haplotype<SimpleInterval>,
591    ) {
592        let ref_source_vertex = graph.get_reference_source_vertex();
593        let ref_sink_vertex = graph.get_reference_sink_vertex();
594        if ref_source_vertex.is_none() {
595            panic!("All reference graphs must have a reference source vertex");
596        };
597
598        if ref_sink_vertex.is_none() {
599            panic!("All reference graphs must have a reference sink vertex");
600        };
601
602        if graph.get_reference_bytes(ref_source_vertex.unwrap(), ref_sink_vertex, true, true)
603            != ref_haplotype.get_bases()
604        {
605            panic!(
606                "Mismatch between the reference haplotype and the reference assembly graph path for \n\
607                 +++++++++ \n\
608                 graph     = {} \n\
609                 haplotype = {} \n\
610                 loc       = {:?} \n\
611                 +++++++++ \n",
612                std::str::from_utf8(
613                    graph
614                        .get_reference_bytes(
615                            ref_source_vertex.unwrap(),
616                            ref_sink_vertex,
617                            true,
618                            true
619                        )
620                        .as_slice()
621                )
622                .unwrap(),
623                std::str::from_utf8(ref_haplotype.get_bases()).unwrap(),
624                &ref_haplotype.genome_location
625            );
626        };
627    }
628
629    /**
630     * Method for getting a list of all of the specified kmer sizes to test for the graph including kmer expansions
631     * @return
632     */
633    fn get_expanded_kmer_list(&self, additional_kmer_sizes: Option<Vec<usize>>) -> Vec<usize> {
634        let mut return_list = Vec::new();
635        return_list.extend(self.kmer_sizes.iter());
636        if !self.dont_increase_kmer_sizes_for_cycles {
637            let mut kmer_size =
638                self.kmer_sizes.iter().max().unwrap() + Self::KMER_SIZE_ITERATION_INCREASE;
639            let mut num_iterations = 1;
640            while num_iterations <= Self::MAX_KMER_ITERATIONS_TO_ATTEMPT {
641                return_list.push(kmer_size);
642                kmer_size += Self::KMER_SIZE_ITERATION_INCREASE;
643                num_iterations += 1;
644            }
645        }
646
647        if let Some(additional_kmer_sizes) = additional_kmer_sizes {
648            return_list.extend(additional_kmer_sizes.iter());
649        }
650
651        return return_list;
652    }
653
654    /**
655     * Print graph to file NOTE this requires that debugGraphTransformations be enabled.
656     *
657     * @param graph the graph to print
658     * @param fileName the name to give the graph file
659     */
660    fn print_debug_graph_transform_abstract<A: AbstractReadThreadingGraph>(
661        &self,
662        graph: &A,
663        file_name: String,
664    ) {
665        // if Self::PRINT_FILL_GRAPH_FOR_DEBUGGING {
666        //     graph.print_graph(file_name, self.prune_factor as usize)
667        // } else {
668        //     grap
669        // }
670        graph.print_graph(file_name, self.prune_factor as usize)
671    }
672
673    /**
674     * Print graph to file NOTE this requires that debugGraphTransformations be enabled.
675     *
676     * @param graph the graph to print
677     * @param fileName the name to give the graph file
678     */
679    fn print_debug_graph_transform_seq_graph<E: BaseEdge>(
680        &self,
681        graph: &SeqGraph<E>,
682        file_name: String,
683    ) {
684        // if Self::PRINT_FILL_GRAPH_FOR_DEBUGGING {
685        //     graph.print_graph(file_name, self.prune_factor as usize)
686        // } else {
687        //     grap
688        // }
689        graph
690            .base_graph
691            .print_graph(&file_name, true, self.prune_factor as usize)
692    }
693
694    /**
695     * Find discover paths by using KBestHaplotypeFinder over each graph.
696     *
697     * This method has the side effect that it will annotate all of the AssemblyResults objects with the derived haplotypes
698     * which can be used for basing kmer graph pruning on the discovered haplotypes.
699     *
700     * @param graph                 graph to be used for kmer detection
701     * @param assemblyResults       assembly results objects for this graph
702     * @param refHaplotype          reference haplotype
703     * @param refLoc                location of reference haplotype
704     * @param activeRegionWindow    window of the active region (without padding)
705     * @param resultSet             (can be null) the results set into which to deposit discovered haplotypes
706     * @param aligner               SmithWaterman aligner to use for aligning the discovered haplotype to the reference haplotype
707     * @return A list of discovered haplotyes (note that this is not currently used for anything)
708     */
709    fn find_best_path<'b, A: AbstractReadThreadingGraph>(
710        &self,
711        // graph: &BaseGraph<V, E>,
712        assembly_result: &'b mut AssemblyResult<SimpleInterval, A>,
713        ref_haplotype: &'b Haplotype<SimpleInterval>,
714        _ref_loc: &'b SimpleInterval,
715        active_region_window: &'b SimpleInterval,
716        haplotype_to_reference_sw_parameters: &Parameters,
717        result_set: &mut AssemblyResultSet<A>,
718        avx_mode: AVXMode,
719    ) {
720        // add the reference haplotype separately from all the others to ensure
721        // that it is present in the list of haplotypes
722        // let mut return_haplotypes = LinkedHashSet::new();
723        let active_region_start = ref_haplotype.alignment_start_hap_wrt_ref;
724        let mut failed_cigars = 0;
725        {
726            // Validate that the graph is valid with extant source and sink before operating
727            let source = assembly_result
728                .graph
729                .as_ref()
730                .unwrap()
731                .base_graph
732                .get_reference_source_vertex();
733            let sink = assembly_result
734                .graph
735                .as_ref()
736                .unwrap()
737                .base_graph
738                .get_reference_sink_vertex();
739            assert!(
740                source.is_some() && sink.is_some(),
741                "Both source and sink cannot be null"
742            );
743
744            let k_best_haplotypes: Box<Vec<KBestHaplotype>> = if self.generate_seq_graph {
745                Box::new(
746                    GraphBasedKBestHaplotypeFinder::new_from_singletons(
747                        &mut assembly_result.graph.as_mut().unwrap().base_graph,
748                        source.unwrap(),
749                        sink.unwrap(),
750                    )
751                    .find_best_haplotypes(
752                        self.num_best_haplotypes_per_graph as usize,
753                        &assembly_result.graph.as_ref().unwrap().base_graph,
754                    ),
755                )
756            } else {
757                // TODO: JunctionTreeKBestHaplotype looks munted and I haven't implemented the other
758                //       JunctionTree stuff so skipping for now
759                panic!("JunctionTree not yet supported, please set generate_seq_graph to true")
760            };
761
762            for k_best_haplotype in k_best_haplotypes.into_iter() {
763                // TODO for now this seems like the solution, perhaps in the future it will be to excise the haplotype completely)
764                // TODO: Lorikeet note, some weird Java shit happens here, will need a work around when
765                //       junction tree is implemented
766                let mut h =
767                    k_best_haplotype.haplotype(&assembly_result.graph.as_ref().unwrap().base_graph);
768                h.kmer_size = k_best_haplotype.kmer_size;
769
770                if !result_set.haplotypes.contains(&h) {
771                    // debug!(
772                    //     "Potential location {:?} potential haplotype {:?}",
773                    //     active_region_window, &h
774                    // );
775                    // TODO this score seems to be irrelevant at this point...
776                    // if k_best_haplotype.is_reference {
777                    //     ref_haplotype.score = OrderedFloat(k_best_haplotype.score);
778                    // };
779
780                    // debug!("+++++++++==================================== Candidates ====================================+++++++++");
781                    // debug!(
782                    //     "ref -> {}",
783                    //     std::str::from_utf8(ref_haplotype.get_bases()).unwrap()
784                    // );
785                    // debug!("alt -> {}", std::str::from_utf8(h.get_bases()).unwrap());
786                    // debug!("+++++++++====================================++++++++++++====================================+++++++++");
787                    let cigar = CigarUtils::calculate_cigar(
788                        ref_haplotype.get_bases(),
789                        h.get_bases(),
790                        OverhangStrategy::SoftClip,
791                        haplotype_to_reference_sw_parameters,
792                        avx_mode,
793                    );
794
795                    match cigar {
796                        None => {
797                            failed_cigars += 1;
798                            continue;
799                        }
800                        Some(cigar) => {
801                            if cigar.is_empty() {
802                                panic!(
803                                    "Smith-Waterman alignment failure. Cigar = {:?}, with reference \
804                                length {} but expecting reference length of {}",
805                                    &cigar, CigarUtils::get_reference_length(&cigar),
806                                    CigarUtils::get_reference_length(&ref_haplotype.cigar)
807                                )
808                            } else if Self::path_is_too_divergent_from_reference(&cigar)
809                                || CigarUtils::get_reference_length(&cigar)
810                                    < Self::MIN_HAPLOTYPE_REFERENCE_LENGTH
811                            {
812                                // N cigar elements means that a bubble was too divergent from the reference so skip over this path
813                                continue;
814                            } else if CigarUtils::get_reference_length(&cigar)
815                                != CigarUtils::get_reference_length(&ref_haplotype.cigar)
816                            {
817                                // the SOFTCLIP strategy can produce a haplotype cigar that matches the beginning of the reference and
818                                // skips the latter part of the reference.  For example, when padded haplotype = NNNNNNNNNN[sequence 1]NNNNNNNNNN
819                                // and padded ref = NNNNNNNNNN[sequence 1][sequence 2]NNNNNNNNNN, the alignment may choose to align only sequence 1.
820                                // If aligning with an indel strategy produces a cigar with deletions for sequence 2 (which is reflected in the
821                                // reference length of the cigar matching the reference length of the ref haplotype), then the assembly window was
822                                // simply too small to reliably resolve the deletion; it should only throw an IllegalStateException when aligning
823                                // with the INDEL strategy still produces discrepant reference lengths.
824                                // You might wonder why not just use the INDEL strategy from the beginning.  This is because the SOFTCLIP strategy only fails
825                                // when there is insufficient flanking sequence to resolve the cigar unambiguously.  The INDEL strategy would produce
826                                // valid but most likely spurious indel cigars.
827                                let cigar_with_indel_strategy = CigarUtils::calculate_cigar(
828                                    ref_haplotype.get_bases(),
829                                    h.get_bases(),
830                                    OverhangStrategy::InDel,
831                                    haplotype_to_reference_sw_parameters,
832                                    avx_mode,
833                                );
834
835                                match cigar_with_indel_strategy {
836                                    None => panic!("Smith-Waterman Alignment failure. No result"),
837                                    Some(cigar_with_indel_strategy) => {
838                                        if CigarUtils::get_reference_length(
839                                            &cigar_with_indel_strategy,
840                                        ) == CigarUtils::get_reference_length(
841                                            &ref_haplotype.cigar,
842                                        ) {
843                                            failed_cigars += 1;
844                                            continue;
845                                        } else {
846                                            panic!(
847                                                "Smith-Waterman alignment failure. Cigar = {:?} with \
848                                            reference length {} but expecting reference length of \
849                                            {} ref = {:?} path = {:?}", &cigar,
850                                                CigarUtils::get_reference_length(&cigar),
851                                                CigarUtils::get_reference_length(&ref_haplotype.cigar),
852                                                std::str::from_utf8(ref_haplotype.get_bases()),
853                                                std::str::from_utf8(h.get_bases()),
854                                            )
855                                        }
856                                    }
857                                }
858                            }
859
860                            h.cigar = cigar;
861                            h.alignment_start_hap_wrt_ref = active_region_start;
862                            h.genome_location = Some(active_region_window.clone());
863                            // debug!(
864                            //     "Adding haplotype {:?} from graph with kmer {}",
865                            //     &h.cigar,
866                            //     assembly_result
867                            //         .graph
868                            //         .as_ref()
869                            //         .unwrap()
870                            //         .base_graph
871                            //         .get_kmer_size()
872                            // );
873                            // return_haplotypes.insert(h.clone());
874                            // result set would get added to here
875                            // let mut result_set = result_set.lock().unwrap();
876                            result_set.add_haplotype(h);
877                        }
878                    }
879                }
880            }
881        }
882
883        // Make sure that the ref haplotype is amongst the return haplotypes and calculate its score as
884        // the first returned by any finder.
885        // HERE we want to preserve the signal that assembly failed completely so in this case we don't add anything to the empty list
886        // if !result_set.haplotypes.is_empty() && !result_set.haplotypes.contains(ref_haplotype) {
887        //     return_haplotypes.insert(ref_haplotype.clone());
888        // };
889
890        if failed_cigars != 0 {
891            // debug!(
892            //     "Failed to align some haplotypes ({}) back to the reference (loc={:?}); \
893            // these will be ignored",
894            //     failed_cigars, ref_loc
895            // )
896        }
897
898        // assembly_result.set_discovered_haplotypes(return_haplotypes);
899    }
900
901    /**
902     * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal
903     * @param c the cigar to test
904     * @return  true if we should skip over this path
905     */
906    fn path_is_too_divergent_from_reference(c: &CigarString) -> bool {
907        return c.0.iter().any(|ce| match ce {
908            Cigar::RefSkip(_) => true,
909            _ => false,
910        });
911    }
912
913    /**
914     * Creates the sequence graph for the given kmerSize
915     *
916     * @param reads            reads to use
917     * @param refHaplotype     reference haplotype
918     * @param kmerSize         kmer size
919     * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
920     * @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers
921     * @param aligner {@link SmithWatermanAligner} used to align dangling ends to the reference sequence
922     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
923     */
924    fn create_graph<'b>(
925        &self,
926        reads: &'b Vec<BirdToolRead>,
927        ref_haplotype: &'b Haplotype<SimpleInterval>,
928        kmer_size: usize,
929        allow_low_complexity_graphs: bool,
930        _allow_non_unique_kmers_in_ref: bool,
931        sample_names: &'b [String],
932        dangling_end_sw_parameters: &Parameters,
933        avx_mode: AVXMode,
934    ) -> Option<AssemblyResult<SimpleInterval, ReadThreadingGraph>> {
935        if ref_haplotype.len() < kmer_size {
936            // happens in cases where the assembled region is just too small
937            return Some(AssemblyResult::new(Status::Failed, None, None));
938        }
939
940        if !self.allow_non_unique_kmers_in_ref
941            && !ReadThreadingGraph::determine_non_unique_kmers(
942                &SequenceForKmers::new(
943                    "ref".to_string(),
944                    ref_haplotype.get_bases(),
945                    0,
946                    ref_haplotype.get_bases().len(),
947                    1,
948                    true,
949                ),
950                kmer_size,
951            )
952            .is_empty()
953        {
954            // debug!("Not using kmer size of {kmer_size} in read threading assembler because reference contains non-unique kmers");
955            return None;
956        }
957
958        let mut rt_graph =
959        // if self.generate_seq_graph {
960            ReadThreadingGraph::new(
961                kmer_size,
962                false,
963                self.min_base_quality_to_use_in_assembly,
964                self.num_pruning_samples as usize,
965                self.min_matching_bases_to_dangling_end_recovery,
966                avx_mode
967            );
968        // } else {
969        //     // This is where the junction tree debruijn graph would go but considering it is experimental
970        //     // we will leave it out for now
971        //     ReadThreadingGraph::new(
972        //         kmer_size,
973        //         false,
974        //         self.min_base_quality_to_use_in_assembly,
975        //         self.num_pruning_samples as usize,
976        //         self.min_matching_bases_to_dangling_end_recovery,
977        //     )
978        // };
979
980        rt_graph.set_threading_start_only_at_existing_vertex(!self.recover_dangling_branches);
981
982        // add the reference sequence to the graph
983        let mut pending = LinkedHashMap::new();
984        rt_graph.add_sequence(
985            &mut pending,
986            "ref".to_string(),
987            // ReadThreadingGraph::ANONYMOUS_SAMPLE,
988            std::usize::MAX,
989            ref_haplotype.get_bases(),
990            0,
991            ref_haplotype.get_bases().len(),
992            1,
993            true,
994        );
995        // debug!(
996        //     "1 - Graph Kmer {} Edges {} Nodes {}",
997        //     kmer_size,
998        //     rt_graph.base_graph.graph.edge_count(),
999        //     rt_graph.base_graph.graph.node_count()
1000        // );
1001
1002        // Next pull kmers out of every read and throw them on the graph
1003        // debug!("1.5 - Reads {}", reads.len());
1004        let mut count = 0;
1005
1006        let mut sample_count = LinkedHashMap::new();
1007        // let mut read_debugging = false;
1008        for read in reads {
1009            let s_count = sample_count.entry(read.sample_index).or_insert(0);
1010            *s_count += 1;
1011            // if read.name() == b"DFDW01000005.1-5" {
1012            //     // debug!("Read {:?}", read);
1013            //     read_debugging = true;
1014            // };
1015            rt_graph.add_read(read, sample_names, &mut count, &mut pending)
1016        }
1017        // debug!("1.5 - Count {} -> {:?}", count, sample_count);
1018        // let pending = rt_graph.get_pending(); // retrieve pending sequences and clear pending from graph
1019        // actually build the read threading graph
1020        rt_graph.build_graph_if_necessary(&mut pending);
1021        // debug!(
1022        //     "2 - Graph Kmer {} Edges {} Nodes {}",
1023        //     kmer_size,
1024        //     rt_graph.base_graph.graph.edge_count(),
1025        //     rt_graph.base_graph.graph.node_count()
1026        // );
1027
1028        if self.debug_graph_transformations {
1029            self.print_debug_graph_transform_abstract(
1030                &rt_graph,
1031                format!(
1032                    "{}_{}-{}-sequenceGraph.{}.0.0.raw_threading_graph.dot",
1033                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1034                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1035                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1036                    kmer_size
1037                ),
1038            )
1039        }
1040
1041        // It's important to prune before recovering dangling ends so that we don't waste time recovering bad ends.
1042        // It's also important to prune before checking for cycles so that sequencing errors don't create false cycles
1043        // and unnecessarily abort assembly
1044        if self.prune_before_cycle_counting {
1045            self.chain_pruner
1046                .prune_low_weight_chains(rt_graph.get_base_graph_mut());
1047        }
1048        // debug!(
1049        //     "3 - Graph Kmer {} Edges {} Nodes {}",
1050        //     kmer_size,
1051        //     rt_graph.base_graph.graph.edge_count(),
1052        //     rt_graph.base_graph.graph.node_count()
1053        // );
1054
1055        // sanity check: make sure there are no cycles in the graph, unless we are in experimental mode
1056        if self.generate_seq_graph && rt_graph.has_cycles() {
1057            // debug!(
1058            //     "Not using kmer size of {}  in read threading assembler \
1059            //         because it contains a cycle",
1060            //     kmer_size
1061            // );
1062            return None;
1063        }
1064
1065        // sanity check: make sure the graph had enough complexity with the given kmer
1066        if !allow_low_complexity_graphs && rt_graph.is_low_quality_graph() {
1067            // debug!(
1068            //     "Not using kmer size of {} in read threading assembler because it does not \
1069            //         produce a graph with enough complexity",
1070            //     kmer_size
1071            // );
1072            return None;
1073        }
1074
1075        let result = self.get_assembly_result(
1076            ref_haplotype,
1077            kmer_size,
1078            rt_graph,
1079            dangling_end_sw_parameters,
1080        );
1081        // check whether recovering dangling ends created cycles
1082        if self.recover_all_dangling_branches
1083            && result.threading_graph.as_ref().unwrap().has_cycles()
1084        {
1085            return None;
1086        }
1087
1088        return Some(result);
1089    }
1090
1091    fn get_assembly_result<A: AbstractReadThreadingGraph>(
1092        &self,
1093        ref_haplotype: &Haplotype<SimpleInterval>,
1094        kmer_size: usize,
1095        mut rt_graph: A,
1096        dangling_end_sw_parameters: &Parameters,
1097    ) -> AssemblyResult<SimpleInterval, A> {
1098        if !self.prune_before_cycle_counting {
1099            self.chain_pruner
1100                .prune_low_weight_chains(rt_graph.get_base_graph_mut())
1101        }
1102
1103        if self.debug_graph_transformations {
1104            self.print_debug_graph_transform_abstract(
1105                &rt_graph,
1106                format!(
1107                    "{}_{}-{}-sequenceGraph.{}.0.1.chain_pruned_readthreading_graph.dot",
1108                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1109                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1110                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1111                    kmer_size
1112                ),
1113            );
1114        };
1115
1116        // look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if
1117        // we can recover them by merging some N bases from the chain back into the reference
1118        if self.recover_dangling_branches {
1119            rt_graph.recover_dangling_tails(
1120                self.prune_factor as usize,
1121                self.min_dangling_branch_length,
1122                self.recover_all_dangling_branches,
1123                dangling_end_sw_parameters,
1124            );
1125            rt_graph.recover_dangling_heads(
1126                self.prune_factor as usize,
1127                self.min_dangling_branch_length,
1128                self.recover_all_dangling_branches,
1129                dangling_end_sw_parameters,
1130            );
1131        }
1132
1133        // remove all heading and trailing paths
1134        if self.remove_paths_not_connected_to_ref {
1135            rt_graph.remove_paths_not_connected_to_ref()
1136        }
1137
1138        if self.debug_graph_transformations {
1139            self.print_debug_graph_transform_abstract(
1140                &rt_graph,
1141                format!(
1142                    "{}_{}-{}-sequenceGraph.{}.0.2.cleaned_readthreading_graph.dot",
1143                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1144                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1145                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1146                    kmer_size
1147                ),
1148            );
1149        };
1150
1151        // Either return an assembly result with a sequence graph or with an unchanged
1152        // sequence graph deptending on the kmer duplication behavior
1153        if self.generate_seq_graph {
1154            let mut initial_seq_graph = rt_graph.to_sequence_graph();
1155
1156            if self.debug_graph_transformations {
1157                rt_graph.print_graph(
1158                    format!(
1159                        "{}_{}-{}-sequenceGraph.{}.0.3.initial_seqgraph.dot",
1160                        ref_haplotype.genome_location.as_ref().unwrap().tid(),
1161                        ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1162                        ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1163                        kmer_size
1164                    ),
1165                    10000,
1166                );
1167            };
1168
1169            // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
1170            if self.just_return_raw_graph {
1171                return AssemblyResult::new(
1172                    Status::AssembledSomeVariation,
1173                    Some(initial_seq_graph),
1174                    None,
1175                );
1176            }
1177
1178            // debug!(
1179            //     "Using kmer size of {} in read threading assembler",
1180            //     &rt_graph.get_kmer_size()
1181            // );
1182
1183            if self.debug_graph_transformations {
1184                self.print_debug_graph_transform_abstract(
1185                    &rt_graph,
1186                    format!(
1187                        "{}_{}-{}-sequenceGraph.{}.0.4.initial_seqgraph.dot",
1188                        ref_haplotype.genome_location.as_ref().unwrap().tid(),
1189                        ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1190                        ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1191                        kmer_size
1192                    ),
1193                );
1194            };
1195
1196            initial_seq_graph.base_graph.clean_non_ref_paths();
1197            let cleaned: AssemblyResult<SimpleInterval, ReadThreadingGraph> =
1198                self.clean_up_seq_graph(initial_seq_graph, &ref_haplotype);
1199            let status = cleaned.status;
1200            return AssemblyResult::new(status, cleaned.graph, Some(rt_graph));
1201        } else {
1202            // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
1203            if self.just_return_raw_graph {
1204                return AssemblyResult::new(Status::AssembledSomeVariation, None, Some(rt_graph));
1205            }
1206
1207            // debug!(
1208            //     "Using kmer size of {} in read threading assembler",
1209            //     &rt_graph.get_kmer_size()
1210            // );
1211            let cleaned = Self::get_result_set_for_rt_graph(rt_graph);
1212            return cleaned;
1213        }
1214    }
1215
1216    fn get_result_set_for_rt_graph<A: AbstractReadThreadingGraph>(
1217        rt_graph: A,
1218    ) -> AssemblyResult<SimpleInterval, A> {
1219        // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd.  Can
1220        // happen in cases where for example the reference somehow manages to acquire a cycle, or
1221        // where the entire assembly collapses back into the reference sequence.
1222        if rt_graph.get_reference_source_vertex().is_none()
1223            || rt_graph.get_reference_sink_vertex().is_none()
1224        {
1225            return AssemblyResult::new(Status::JustAssembledReference, None, Some(rt_graph));
1226        };
1227
1228        return AssemblyResult::new(Status::AssembledSomeVariation, None, Some(rt_graph));
1229    }
1230
1231    // Performs the various transformations necessary on a sequence graph
1232    fn clean_up_seq_graph<A: AbstractReadThreadingGraph>(
1233        &self,
1234        mut seq_graph: SeqGraph<BaseEdgeStruct>,
1235        ref_haplotype: &Haplotype<SimpleInterval>,
1236    ) -> AssemblyResult<SimpleInterval, A> {
1237        if self.debug_graph_transformations {
1238            self.print_debug_graph_transform_seq_graph(
1239                &seq_graph,
1240                format!(
1241                    "{}_{}-{}-sequenceGraph.{}.1.0.non_ref_removed.dot",
1242                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1243                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1244                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1245                    seq_graph.base_graph.get_kmer_size()
1246                ),
1247            );
1248        };
1249
1250        // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
1251        seq_graph.zip_linear_chains();
1252        if self.debug_graph_transformations {
1253            self.print_debug_graph_transform_seq_graph(
1254                &seq_graph,
1255                format!(
1256                    "{}_{}-{}-sequenceGraph.{}.1.1.zipped.dot",
1257                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1258                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1259                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1260                    seq_graph.base_graph.get_kmer_size()
1261                ),
1262            );
1263        };
1264
1265        // now go through and prune the graph, removing vertices no longer connected to the reference chain
1266        seq_graph.base_graph.remove_singleton_orphan_vertices();
1267        seq_graph
1268            .base_graph
1269            .remove_vertices_not_connected_to_ref_regardless_of_edge_direction();
1270
1271        if self.debug_graph_transformations {
1272            self.print_debug_graph_transform_seq_graph(
1273                &seq_graph,
1274                format!(
1275                    "{}_{}-{}-sequenceGraph.{}.1.2.pruned.dot",
1276                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1277                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1278                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1279                    seq_graph.base_graph.get_kmer_size()
1280                ),
1281            );
1282        };
1283
1284        seq_graph.simplify_graph(&format!(
1285            "{}_{}-{}.0",
1286            ref_haplotype.genome_location.as_ref().unwrap().tid(),
1287            ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1288            ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1289        ));
1290        if self.debug_graph_transformations {
1291            self.print_debug_graph_transform_seq_graph(
1292                &seq_graph,
1293                format!(
1294                    "{}_{}-{}-sequenceGraph.{}.1.3.merged.dot",
1295                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1296                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1297                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1298                    seq_graph.base_graph.get_kmer_size()
1299                ),
1300            );
1301        };
1302
1303        // The graph has degenerated in some way, so the reference source and/or sink cannot be id'd.  Can
1304        // happen in cases where for example the reference somehow manages to acquire a cycle, or
1305        // where the entire assembly collapses back into the reference sequence.
1306        if seq_graph.base_graph.get_reference_source_vertex().is_none()
1307            || seq_graph.base_graph.get_reference_sink_vertex().is_none()
1308        {
1309            return AssemblyResult::new(Status::JustAssembledReference, Some(seq_graph), None);
1310        };
1311
1312        seq_graph.base_graph.remove_paths_not_connected_to_ref();
1313        seq_graph.simplify_graph(&format!(
1314            "{}_{}-{}.1",
1315            ref_haplotype.genome_location.as_ref().unwrap().tid(),
1316            ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1317            ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1318        ));
1319        if seq_graph.base_graph.graph.node_indices().count() == 1 {
1320            // we've perfectly assembled into a single reference haplotype, add a empty seq vertex to stop
1321            // the code from blowing up.
1322            // TODO -- ref properties should really be on the vertices, not the graph itself
1323            let complete = seq_graph.base_graph.graph.node_indices().next().unwrap();
1324            let dummy = SeqVertex::new(Vec::new());
1325            let dummy_index = seq_graph.base_graph.add_node(&dummy);
1326            seq_graph.base_graph.graph.add_edge(
1327                complete,
1328                dummy_index,
1329                BaseEdgeStruct::new(true, 0, 0),
1330            );
1331        };
1332
1333        if self.debug_graph_transformations {
1334            self.print_debug_graph_transform_seq_graph(
1335                &seq_graph,
1336                format!(
1337                    "{}_{}-{}-sequenceGraph.{}.1.4.final.dot",
1338                    ref_haplotype.genome_location.as_ref().unwrap().tid(),
1339                    ref_haplotype.genome_location.as_ref().unwrap().get_start(),
1340                    ref_haplotype.genome_location.as_ref().unwrap().get_end(),
1341                    seq_graph.base_graph.get_kmer_size(),
1342                ),
1343            );
1344        };
1345
1346        return AssemblyResult::new(Status::AssembledSomeVariation, Some(seq_graph), None);
1347    }
1348
1349    // fn print_seq_graphs(&self, graphs: &Vec<SeqGraph<BaseEdgeStruct>>) {
1350    //     let _write_first_graph_with_size_smaller_than = 50;
1351
1352    //     for (idx, graph) in graphs.iter().enumerate() {
1353    //         graph.base_graph.print_graph(
1354    //             &(self.graph_output_path.as_ref().unwrap().to_string() + &idx.to_string()),
1355    //             false,
1356    //             self.prune_factor as usize,
1357    //         )
1358    //     }
1359    // }
1360}
lorikeet_genome/read_threading/read_threading_assembler.rs

lorikeet_genome/read_threading/
read_threading_assembler.rs