lorikeet_genome/graphs/
base_graph.rs

1use hashlink::LinkedHashSet;
2use petgraph::prelude::{EdgeIndex, NodeIndex};
3use petgraph::stable_graph::{Externals, StableDiGraph};
4use petgraph::visit::EdgeRef;
5use petgraph::Direction;
6use petgraph::{algo, Directed};
7use std::cmp::Ordering;
8use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
9use std::fs::File;
10use std::hash::Hash;
11use std::io::Write;
12
13use crate::read_threading::multi_debruijn_vertex::MultiDeBruijnVertex;
14use crate::graphs::base_edge::{BaseEdge, BaseEdgeStruct};
15use crate::graphs::base_vertex::BaseVertex;
16use crate::graphs::path::Path;
17use crate::graphs::seq_graph::SeqGraph;
18use crate::graphs::seq_vertex::SeqVertex;
19use crate::utils::base_utils::BaseUtils;
20
21/**
22 * Common code for graphs used for local assembly.
23 */
24#[derive(Debug, Clone)]
25pub struct BaseGraph<V: BaseVertex + Hash, E: BaseEdge> {
26    kmer_size: usize,
27    pub graph: StableDiGraph<V, E>,
28}
29
30impl<V: BaseVertex + Hash, E: BaseEdge> Eq for BaseGraph<V, E> {}
31
32impl<V: BaseVertex + Hash, E: BaseEdge> PartialEq for BaseGraph<V, E> {
33    fn eq(&self, other: &Self) -> bool {
34        self.graph_equals(other)
35    }
36}
37
38impl<V: BaseVertex + Hash, E: BaseEdge> BaseGraph<V, E> {
39    pub fn new(kmer_size: usize) -> BaseGraph<V, E> {
40        BaseGraph {
41            kmer_size,
42            graph: StableDiGraph::<V, E>::new(),
43        }
44    }
45
46    /**
47     * Convert this kmer graph to a simple sequence graph.
48     *
49     * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
50     * graph.  Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
51     *
52     * @return a newly allocated SequenceGraph
53     */
54    pub fn to_sequence_graph(&self) -> SeqGraph<BaseEdgeStruct> {
55        let mut seq_graph = SeqGraph::new(self.kmer_size);
56        let mut vertex_map = HashMap::new();
57
58        // create all of the equivalent seq graph vertices
59        for dv in self.graph.node_indices() {
60            let v_weight = self.graph.node_weight(dv).unwrap();
61            let mut sv = SeqVertex::new(
62                v_weight
63                    .get_additional_sequence(self.is_source(dv))
64                    .to_vec(),
65            );
66            sv.set_additional_info(v_weight.get_additional_info());
67            let sv_ind = seq_graph.base_graph.add_node(&sv);
68            vertex_map.insert(dv, sv_ind);
69        }
70
71        // walk through the nodes and connect them to their equivalent seq vertices
72        for e in self.graph.edge_indices() {
73            let seq_in_v = *vertex_map.get(&self.get_edge_source(e)).unwrap();
74            let seq_out_v = *vertex_map.get(&self.get_edge_target(e)).unwrap();
75            let e_weight = self.graph.edge_weight(e).unwrap();
76            seq_graph.base_graph.graph.add_edge(
77                seq_in_v,
78                seq_out_v,
79                BaseEdgeStruct::new(e_weight.is_ref(), e_weight.get_multiplicity(), 0),
80            );
81        }
82
83        return seq_graph;
84    }
85
86    pub fn vertex_set(&self) -> HashSet<&V> {
87        self.graph.node_weights().collect::<HashSet<&V>>()
88    }
89
90    pub fn edge_set(&self) -> HashSet<&E> {
91        self.graph.edge_weights().collect::<HashSet<&E>>()
92    }
93
94    pub fn edges_directed(&self, node: NodeIndex, direction: Direction) -> Vec<EdgeIndex> {
95        self.graph
96            .edges_directed(node, direction)
97            .map(|e| e.id())
98            .collect::<Vec<EdgeIndex>>()
99    }
100
101    /**
102     * Checks whether the graph contains all the vertices in a collection.
103     *
104     * @param vertices the vertices to check. Must not be null and must not contain a null.
105     *
106     * @throws IllegalArgumentException if {@code vertices} is {@code null}.
107     *
108     * @return {@code true} if all the vertices in the input collection are present in this graph.
109     * Also if the input collection is empty. Otherwise it returns {@code false}.
110     */
111    pub fn contains_all_vertices<'a, I: IntoIterator<Item = &'a NodeIndex>>(
112        &self,
113        nodes: I,
114    ) -> bool {
115        return nodes.into_iter().all(|v| self.graph.contains_node(*v));
116    }
117
118    pub fn contains_edge(&self, edge: EdgeIndex) -> bool {
119        self.graph.edge_endpoints(edge).is_some()
120    }
121
122    pub fn get_kmer_size(&self) -> usize {
123        self.kmer_size
124    }
125
126    /**
127     * @param v the vertex to test
128     * @return  true if this vertex is a reference node (meaning that it appears on the reference path in the graph)
129     */
130    pub fn is_reference_node(&self, vertex_index: NodeIndex) -> bool {
131        if self
132            .graph
133            .edges_directed(vertex_index, Direction::Incoming)
134            .into_iter()
135            .any(|e| e.weight().is_ref())
136            || self
137                .graph
138                .edges_directed(vertex_index, Direction::Outgoing)
139                .into_iter()
140                .any(|e| e.weight().is_ref())
141        {
142            return true;
143        } else {
144            self.vertex_set().len() == 1
145        }
146    }
147
148    /**
149     * @param v the vertex to test
150     * @return  true if this vertex is a source node (in degree == 0)
151     */
152    pub fn is_source(&self, vertex_index: NodeIndex) -> bool {
153        return self.in_degree_of(vertex_index) == 0;
154    }
155
156    /**
157     * @param v the vertex to test
158     * @return  true if this vertex is a sink node (out degree == 0)
159     */
160    pub fn is_sink(&self, veretex_index: NodeIndex) -> bool {
161        return self.out_degree_of(veretex_index) == 0;
162    }
163
164    /**
165     * @param v the vertex to test
166     * @return number of incoming edges
167     */
168    pub fn in_degree_of(&self, vertex_index: NodeIndex) -> usize {
169        self.graph
170            .edges_directed(vertex_index, Direction::Incoming)
171            .count()
172    }
173
174    /**
175     * @param v the vertex to test
176     * @return number of outgoing edges
177     */
178    pub fn out_degree_of(&self, vertex_index: NodeIndex) -> usize {
179        self.graph
180            .edges_directed(vertex_index, Direction::Outgoing)
181            .count()
182    }
183
184    /**
185     * Get the set of source vertices of this graph
186     * NOTE: We return a BTreeSet here in order to preserve the determinism in the output order of VertexSet(),
187     *       which is deterministic in output due to the underlying sets all being BTreeSet
188     * @return a non-null set
189     */
190    pub fn get_sources(&self) -> VecDeque<NodeIndex> {
191        return self
192            .graph
193            .externals(Direction::Incoming)
194            .collect::<VecDeque<NodeIndex>>();
195    }
196
197    pub fn get_sources_generic(&self) -> Externals<'_, V, Directed, u32> {
198        return self.graph.externals(Direction::Incoming);
199    }
200
201    /**
202     * Get the set of sink vertices of this graph
203     * NOTE: We return a BTreeSet here in order to preserve the determinism in the output order of VertexSet(),
204     *       which is deterministic in output due to the underlying sets all being BTreeSet
205     * @return a non-null set
206     */
207    pub fn get_sinks(&self) -> BinaryHeap<NodeIndex> {
208        return self
209            .graph
210            .externals(Direction::Outgoing)
211            .collect::<BinaryHeap<NodeIndex>>();
212    }
213
214    pub fn get_sinks_generic(&self) -> Externals<'_, V, Directed, u32> {
215        return self.graph.externals(Direction::Outgoing);
216    }
217
218    pub fn compare_paths(&self, first_path: &Path, second_path: &Path) -> Ordering {
219        return BaseUtils::bases_comparator(
220            &first_path.get_bases(&self),
221            &second_path.get_bases(&self),
222        );
223    }
224
225    /**
226     * Get the set of vertices connected to v by incoming edges
227     * NOTE: We return a HashSet here in order to preserve the determinism in the output order of VertexSet(),
228     *       which is deterministic in output due to the underlying sets all being HashSets.
229     * @param v a non-null vertex
230     * @return a set of vertices {X} connected X -> v
231     */
232    pub fn incoming_vertices_of(&self, v: NodeIndex) -> LinkedHashSet<NodeIndex> {
233        return self
234            .graph
235            .neighbors_directed(v, Direction::Incoming)
236            .collect::<LinkedHashSet<NodeIndex>>();
237    }
238
239    /**
240     * Get the set of vertices connected to v by outgoing edges
241     * NOTE: We return a HashSet here in order to preserve the determinism in the output order of VertexSet(),
242     *       which is deterministic in output due to the underlying sets all being HashSets.
243     * @param v a non-null vertex
244     * @return a set of vertices {X} connected X -> v
245     */
246    pub fn outgoing_vertices_of(&self, v: NodeIndex) -> LinkedHashSet<NodeIndex> {
247        return self
248            .graph
249            .neighbors_directed(v, Direction::Outgoing)
250            .collect::<LinkedHashSet<NodeIndex>>();
251    }
252
253    /**
254     * Get the incoming edge of v.  Requires that there be only one such edge or throws an error
255     * @param v our vertex
256     * @return the single incoming edge to v, or null if none exists
257     */
258    pub fn incoming_edge_of(&self, v: NodeIndex) -> Option<EdgeIndex> {
259        self.get_directed_singleton_edge(v, Direction::Incoming)
260    }
261
262    /**
263     * Get the incoming edge of v.  Requires that there be only one such edge or throws an error
264     * @param v our vertex
265     * @return the single incoming edge to v, or null if none exists
266     */
267    pub fn outgoing_edge_of(&self, v: NodeIndex) -> Option<EdgeIndex> {
268        self.get_directed_singleton_edge(v, Direction::Outgoing)
269    }
270
271    fn get_directed_singleton_edge(&self, v: NodeIndex, direction: Direction) -> Option<EdgeIndex> {
272        self.get_singleton_edge(
273            self.graph
274                .edges_directed(v, direction)
275                .map(|e| e.id())
276                .collect::<Vec<EdgeIndex>>(),
277        )
278    }
279
280    /**
281     * Helper function that gets the a single edge from edges, null if edges is empty, or
282     * throws an error is edges has more than 1 element
283     * @param edges a set of edges
284     * @return a edge
285     */
286    fn get_singleton_edge(&self, edges: Vec<EdgeIndex>) -> Option<EdgeIndex> {
287        assert!(
288            edges.len() <= 1,
289            "Cannot get a single incoming edge for a vertex with multiple incoming edges {:?}",
290            edges
291        );
292        return if edges.is_empty() {
293            None
294        } else {
295            Some(edges[0])
296        };
297    }
298
299    pub fn get_edge_target(&self, edge: EdgeIndex) -> NodeIndex {
300        self.graph.edge_endpoints(edge).unwrap().1
301    }
302
303    pub fn get_edge_source(&self, edge: EdgeIndex) -> NodeIndex {
304        self.graph.edge_endpoints(edge).unwrap().0
305    }
306
307    pub fn edge_is_ref(&self, edge: EdgeIndex) -> bool {
308        self.graph.edge_weight(edge).unwrap().is_ref()
309    }
310
311    /**
312     * Removes all provided vertices from the graph
313     */
314    pub fn remove_all_vertices<'a, I>(&'a mut self, vertices: I)
315    where
316        I: IntoIterator<Item = &'a NodeIndex>,
317    {
318        for vertex in vertices.into_iter() {
319            self.graph.remove_node(*vertex);
320        }
321    }
322
323    /**
324     * Removes all provided edges from the graph
325     */
326    pub fn remove_all_edges<'a, I>(&'a mut self, edges: I)
327    where
328        I: IntoIterator<Item = &'a EdgeIndex>,
329    {
330        for edge in edges.into_iter() {
331            self.graph.remove_edge(*edge);
332        }
333    }
334
335    /**
336     * @param v the vertex to test
337     * @return  true if this vertex is a reference source
338     */
339    pub fn is_ref_source(&self, v: NodeIndex) -> bool {
340        // confirm that no incoming edges are reference edges
341        if self
342            .graph
343            .edges_directed(v, Direction::Incoming)
344            .any(|e| e.weight().is_ref())
345        {
346            return false;
347        }
348
349        // confirm that there is an outgoing reference edge
350        if self
351            .graph
352            .edges_directed(v, Direction::Outgoing)
353            .any(|e| e.weight().is_ref())
354        {
355            return true;
356        }
357
358        // edge case: if the graph only has one node then it's a ref source, otherwise it's not
359        return self.graph.node_indices().count() == 1;
360    }
361
362    /**
363     * @param v the vertex to test
364     * @return  true if this vertex is a reference source
365     */
366    pub fn is_ref_sink(&self, v: NodeIndex) -> bool {
367        // confirm that no incoming edges are reference edges
368        if self
369            .graph
370            .edges_directed(v, Direction::Outgoing)
371            .any(|e| e.weight().is_ref())
372        {
373            return false;
374        }
375
376        // confirm that there is an outgoing reference edge
377        if self
378            .graph
379            .edges_directed(v, Direction::Incoming)
380            .any(|e| e.weight().is_ref())
381        {
382            return true;
383        }
384
385        // edge case: if the graph only has one node then it's a ref source, otherwise it's not
386        return self.graph.node_indices().count() == 1;
387    }
388
389    /**
390     * Remove all vertices in the graph that have in and out degree of 0
391     */
392    pub fn remove_singleton_orphan_vertices(&mut self) {
393        // Run through the graph and clean up singular orphaned nodes
394        //Note: need to collect nodes to remove first because we can't directly modify the list we're iterating over
395        let to_remove = self
396            .graph
397            .node_indices()
398            .filter(|v| self.is_singleton_orphan(*v))
399            .collect::<HashSet<NodeIndex>>();
400        // debug!("Orphans {}", to_remove.len());
401        self.remove_all_vertices(&to_remove)
402    }
403
404    fn is_singleton_orphan(&self, v: NodeIndex) -> bool {
405        return self.in_degree_of(v) == 0 && self.out_degree_of(v) == 0 && !self.is_ref_source(v);
406    }
407
408    /**
409     * @return the reference source vertex pulled from the graph, can be None if it doesn't exist in the graph
410     */
411    pub fn get_reference_source_vertex(&self) -> Option<NodeIndex> {
412        return self
413            .graph
414            .node_indices()
415            .filter(|v| self.is_ref_source(*v))
416            .next();
417    }
418
419    /**
420     * @return the reference sink vertex pulled from the graph, can be None if it doesn't exist in the graph
421     */
422    pub fn get_reference_sink_vertex(&self) -> Option<NodeIndex> {
423        return self
424            .graph
425            .node_indices()
426            .filter(|v| self.is_ref_sink(*v))
427            .next();
428    }
429
430    /**
431     * Traverse the graph and get the next reference vertex if it exists
432     * @param v the current vertex, can be null
433     * @param allowNonRefPaths if true, allow sub-paths that are non-reference if there is only a single outgoing edge
434     * @param blacklistedEdge optional edge to ignore in the traversal down; useful to exclude the non-reference dangling paths
435     * @return the next vertex (but not necessarily on the reference path if allowNonRefPaths is true) if it exists, otherwise null
436     */
437    pub fn get_next_reference_vertex(
438        &self,
439        v: Option<NodeIndex>,
440        allow_non_ref_paths: bool,
441        blacklisted_edge: Option<EdgeIndex>,
442    ) -> Option<NodeIndex> {
443        if v.is_none() {
444            return None;
445        }
446
447        let v = v.unwrap();
448        let outgoing_edges = self.graph.edges_directed(v, Direction::Outgoing);
449
450        for edge_to_test in outgoing_edges {
451            if edge_to_test.weight().is_ref() {
452                return Some(self.get_edge_target(edge_to_test.id()));
453            }
454        }
455
456        if !allow_non_ref_paths {
457            return None;
458        }
459
460        //singleton or empty set
461        let outgoing_edges = self.graph.edges_directed(v, Direction::Outgoing);
462        let edges = match blacklisted_edge {
463            Some(blacklisted_edge) => {
464                let edges = outgoing_edges
465                    .filter(|e| e.id() != blacklisted_edge)
466                    .map(|e| e.id())
467                    .collect::<Vec<EdgeIndex>>();
468                edges
469            }
470            None => {
471                let edges = outgoing_edges.map(|e| e.id()).collect::<Vec<EdgeIndex>>();
472                edges
473            }
474        };
475
476        return if edges.len() == 1 {
477            Some(self.get_edge_target(edges[0]))
478        } else {
479            None
480        };
481    }
482
483    /**
484     * Traverse the graph and get the previous reference vertex if it exists
485     * @param v the current vertex, can be null
486     * @return  the previous reference vertex if it exists or null otherwise.
487     */
488    pub fn get_prev_reference_vertex(&self, v: Option<NodeIndex>) -> Option<NodeIndex> {
489        match v {
490            None => return None,
491            Some(v) => self
492                .graph
493                .edges_directed(v, Direction::Incoming)
494                .map(|e| self.get_edge_source(e.id()))
495                .filter(|v| self.is_reference_node(*v))
496                .take(1)
497                .next(),
498        }
499    }
500
501    /**
502     * Print out the graph in the dot language for visualization
503     * @param destination File to write to
504     */
505    pub fn print_graph(&self, destination: &str, write_header: bool, prune_factor: usize) {
506        let mut graph_writer = File::create(destination).unwrap();
507
508        if write_header {
509            graph_writer.write(b"digraph assemblyGraphs {\n").expect("Failed to write to file");
510        };
511
512        for edge in self.graph.edge_indices() {
513            let edge_string = format!(
514                "\t{} -> {} ",
515                self.get_edge_source(edge).index(),
516                self.get_edge_target(edge).index()
517            );
518            let edge_label_string;
519            let edge_weight = self.graph.edge_weight(edge).unwrap();
520            if edge_weight.get_multiplicity() > 0 && edge_weight.get_multiplicity() < prune_factor {
521                edge_label_string = format!(
522                    "[style=dotted,color=grey,label=\"{}\"];",
523                    edge_weight.get_dot_label()
524                );
525            } else {
526                edge_label_string = format!("[label=\"{}\"];", edge_weight.get_dot_label());
527            }
528            graph_writer.write(edge_string.as_bytes()).expect("Failed to write to file");
529            graph_writer.write(edge_label_string.as_bytes()).expect("Failed to write to file");
530            if edge_weight.is_ref() {
531                graph_writer.write(format!("{} [color=red];\n", edge_string).as_bytes()).expect("Failed to write to file");
532            }
533        }
534
535        for v in self.graph.node_indices() {
536            let node_weight = self.graph.node_weight(v).unwrap();
537            graph_writer.write(
538                format!(
539                    "\t{} [label=\"{}\",shape=box]\n",
540                    node_weight.to_string(),
541                    format!(
542                        "{}{}",
543                        std::str::from_utf8(self.get_additional_sequence(v, node_weight)).unwrap(),
544                        node_weight.get_additional_info()
545                    )
546                )
547                .as_bytes(),
548            ).expect("Failed to write to file");
549        }
550
551        if write_header {
552            graph_writer.write(b"}\n").expect("Failed to write to file");
553        };
554    }
555
556    /**
557     * Pull out the additional sequence implied by traversing this node in the graph
558     * @param v the vertex from which to pull out the additional base sequence
559     * @return  non-null byte array
560     */
561    pub fn get_additional_sequence<'a>(&self, index: NodeIndex, v: &'a V) -> &'a [u8] {
562        return v.get_additional_sequence(self.is_source(index));
563    }
564
565    pub fn get_sequence_from_index<'a>(&'a self, index: NodeIndex) -> &'a [u8] {
566        return self
567            .graph
568            .node_weight(index)
569            .unwrap()
570            .get_additional_sequence(self.is_source(index));
571    }
572
573    /**
574     * Get the set of vertices within distance edges of source, regardless of edge direction
575     *
576     * @param source the source vertex to consider
577     * @param distance the distance
578     * @return a set of vertices within distance of source
579     */
580    fn vertices_within_distance(&self, source: NodeIndex, distance: usize) -> HashSet<NodeIndex> {
581        if distance == 0 {
582            return vec![source].into_iter().collect::<HashSet<NodeIndex>>();
583        }
584
585        let mut found = HashSet::new();
586        found.insert(source);
587        for v in self.graph.neighbors(source) {
588            found.extend(self.vertices_within_distance(v, distance - 1))
589        }
590
591        return found;
592    }
593
594    /**
595     * Get a graph containing only the vertices within distance edges of target
596     * @param target a vertex in graph
597     * @param distance the max distance
598     * @return a non-null graph
599     */
600    pub fn subset_to_neighbours(&self, target: NodeIndex, distance: usize) -> BaseGraph<V, E> {
601        let to_keep = self.vertices_within_distance(target, distance);
602        let mut result = self.graph.clone();
603        result.retain_nodes(|_gr, v| to_keep.contains(&v));
604        BaseGraph {
605            kmer_size: self.kmer_size,
606            graph: result,
607        }
608    }
609
610    /**
611     * Checks for the presence of directed cycles in the graph.
612     *
613     * @return {@code true} if the graph has cycles, {@code false} otherwise.
614     */
615    pub fn has_cycles(&self) -> bool {
616        algo::is_cyclic_directed(&self.graph)
617    }
618
619    /**
620     * Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex
621     *
622     * More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection,
623     * as it requires vertices to not only be connected by a series of directed edges but also prunes away
624     * paths that do not also meet eventually with the reference sink vertex
625     */
626    pub fn remove_paths_not_connected_to_ref(&mut self) {
627        if self.get_reference_source_vertex().is_none()
628            || self.get_reference_sink_vertex().is_none()
629        {
630            panic!("Graph must have ref source and sink vertices");
631        }
632
633        // get the set of vertices we can reach by going forward from the ref source
634        let mut on_path_from_ref_source = self.get_all_connected_nodes_from_node(
635            self.get_reference_source_vertex().unwrap(),
636            Direction::Outgoing,
637        );
638
639        // get the set of vertices we can reach by going backward from the ref sink
640        let on_path_from_ref_sink = self.get_all_connected_nodes_from_node(
641            self.get_reference_sink_vertex().unwrap(),
642            Direction::Incoming,
643        );
644
645        // we want to remove anything that's not in both the sink and source sets
646        let mut vertices_to_remove = self.graph.node_indices().collect::<HashSet<NodeIndex>>();
647        on_path_from_ref_source.retain(|v| on_path_from_ref_sink.contains(v));
648        vertices_to_remove.retain(|v| !on_path_from_ref_source.contains(v));
649        // let not_in_both = on_path_from_ref_source
650        //     .symmetric_difference(&on_path_from_ref_sink)
651        //     .map(|v| *v)
652        //     .collect::<HashSet<NodeIndex>>();
653
654        // vertices_to_remove.retain(|v| not_in_both.contains(v));
655        self.remove_all_vertices(&vertices_to_remove);
656
657        // simple sanity checks that this algorithm is working.
658        if self.get_sinks().len() > 1 {
659            panic!(
660                "Should have eliminated all but the reference sink, but found {:?}",
661                self.get_sinks()
662            );
663        };
664
665        if self.get_sources().len() > 1 {
666            panic!(
667                "Should have eliminated all but the reference sink, but found {:?}",
668                self.get_sources()
669            );
670        };
671    }
672
673    /**
674     * Iterate through the BaseGraph and return all vertices reachable by specified vertex in given
675     * direction
676     *
677     * Note that if both followIncomingEdges and followOutgoingEdges are false, we simply return the
678     * start vertex
679     *
680     * @param graph the graph to iterator over.  Cannot be null
681     * @param start the vertex to start at.  Cannot be null
682     * @param followIncomingEdges should we follow incoming edges during our
683     *                            traversal? (goes backward through the graph)
684     * @param followOutgoingEdges should we follow outgoing edges during out traversal?
685     */
686    fn get_all_connected_nodes_from_node(
687        &self,
688        starting_node: NodeIndex,
689        direction: Direction,
690    ) -> HashSet<NodeIndex> {
691        let mut to_visit = BinaryHeap::new();
692        let mut visited = HashSet::new();
693        to_visit.push(starting_node);
694
695        let mut v;
696        while !to_visit.is_empty() {
697            v = to_visit.pop();
698            if !visited.contains(&v.unwrap()) {
699                visited.insert(v.unwrap());
700                to_visit.extend(
701                    self.graph
702                        .neighbors_directed(v.unwrap(), direction)
703                        .collect::<Vec<NodeIndex>>(),
704                );
705            }
706        }
707
708        return visited;
709    }
710
711    /**
712     * Remove edges that are connected before the reference source and after the reference sink
713     *
714     * Also removes all vertices that are orphaned by this process
715     */
716    pub fn clean_non_ref_paths(&mut self) {
717        if self.get_reference_source_vertex().is_none()
718            || self.get_reference_sink_vertex().is_none()
719        {
720            // pass
721        } else {
722            // Remove non-ref edges connected before and after the reference path
723            let mut edges_to_check: BinaryHeap<EdgeIndex> = BinaryHeap::new();
724            edges_to_check.extend(
725                self.graph
726                    .edges_directed(
727                        self.get_reference_source_vertex().unwrap(),
728                        Direction::Incoming,
729                    )
730                    .map(|edge_ref| edge_ref.id()),
731            );
732
733            while !edges_to_check.is_empty() {
734                let e = edges_to_check.pop().unwrap();
735                if !self.graph.edge_weight(e).unwrap().is_ref() {
736                    edges_to_check.extend(
737                        self.graph
738                            .edges_directed(self.get_edge_source(e), Direction::Incoming)
739                            .map(|edge_ref| edge_ref.id()),
740                    );
741                    self.graph.remove_edge(e);
742                }
743            }
744
745            edges_to_check.extend(
746                self.graph
747                    .edges_directed(
748                        self.get_reference_sink_vertex().unwrap(),
749                        Direction::Outgoing,
750                    )
751                    .map(|edge_ref| edge_ref.id()),
752            );
753
754            while !edges_to_check.is_empty() {
755                let e = edges_to_check.pop().unwrap();
756                if !self.graph.edge_weight(e).unwrap().is_ref() {
757                    edges_to_check.extend(
758                        self.graph
759                            .edges_directed(self.get_edge_target(e), Direction::Outgoing)
760                            .map(|edge_ref| edge_ref.id()),
761                    );
762                    self.graph.remove_edge(e);
763                }
764            }
765
766            self.remove_singleton_orphan_vertices();
767        }
768    }
769
770    /**
771     * Remove all vertices on the graph that cannot be accessed by following any edge,
772     * regardless of its direction, from the reference source vertex
773     */
774    pub fn remove_vertices_not_connected_to_ref_regardless_of_edge_direction(&mut self) {
775        let mut to_remove = self.graph.node_indices().collect::<HashSet<NodeIndex>>();
776
777        let ref_v = self.get_reference_source_vertex();
778        match ref_v {
779            None => self.remove_all_vertices(&to_remove),
780            Some(ref_v) => {
781                for node in self.get_all_connected_nodes_from_node(ref_v, Direction::Incoming) {
782                    to_remove.remove(&node);
783                }
784
785                for node in self.get_all_connected_nodes_from_node(ref_v, Direction::Outgoing) {
786                    to_remove.remove(&node);
787                }
788                to_remove.remove(&ref_v);
789
790                self.remove_all_vertices(&to_remove);
791            }
792        }
793    }
794
795    /**
796     * Walk along the reference path in the graph and pull out the corresponding bases
797     * @param fromVertex    starting vertex
798     * @param toVertex      ending vertex
799     * @param includeStart  should the starting vertex be included in the path
800     * @param includeStop   should the ending vertex be included in the path
801     * @return              byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
802     */
803    pub fn get_reference_bytes(
804        &self,
805        from_vertex: NodeIndex,
806        to_vertex: Option<NodeIndex>,
807        include_start: bool,
808        include_stop: bool,
809    ) -> Vec<u8> {
810        let mut bytes = Vec::new();
811        let mut v = Some(from_vertex);
812        if include_start {
813            bytes.extend(
814                self.graph
815                    .node_weight(v.unwrap())
816                    .unwrap()
817                    .get_additional_sequence(true),
818            );
819        };
820
821        v = self.get_next_reference_vertex(v, false, None);
822        while v.is_some() && v != to_vertex {
823            bytes.extend(
824                self.graph
825                    .node_weight(v.unwrap())
826                    .unwrap()
827                    .get_additional_sequence(true),
828            );
829            v = self.get_next_reference_vertex(v, false, None);
830        }
831
832        if include_stop && v.is_some() && v == to_vertex {
833            bytes.extend(
834                self.graph
835                    .node_weight(v.unwrap())
836                    .unwrap()
837                    .get_additional_sequence(true),
838            );
839        };
840
841        return bytes;
842    }
843
844    pub fn add_vertices<'a, I>(&mut self, vertices: I) -> Vec<NodeIndex>
845    where
846        V: BaseVertex + 'a,
847        I: IntoIterator<Item = &'a V>,
848    {
849        let node_indices = vertices
850            .into_iter()
851            .map(|v| self.add_node(v))
852            .collect::<Vec<NodeIndex>>();
853
854        return node_indices;
855    }
856
857    pub fn add_edges(&mut self, start: NodeIndex, remaining: Vec<NodeIndex>, template: E) {
858        let mut prev = start;
859        for next in remaining {
860            self.graph.add_edge(prev, next, template.clone());
861            prev = next;
862        }
863    }
864
865    /**
866     * Gets all node weights in vector of node indices. Returns a vector of Options pointing to references
867     */
868    pub fn get_node_weights(&self, node_indices: &Vec<NodeIndex>) -> Vec<Option<&V>> {
869        node_indices
870            .into_iter()
871            .map(|n| self.graph.node_weight(*n))
872            .collect::<Vec<Option<&V>>>()
873    }
874
875    /**
876     * Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure
877     *
878     * By similar this means that both graphs have the same number of vertices, where each vertex can find
879     * a vertex in the other graph that's seqEqual to it.  A similar constraint applies to the edges,
880     * where all edges in g1 must have a corresponding edge in g2 where both source and target vertices are
881     * seqEqual
882     *
883     * @param g1 the first graph to compare
884     * @param g2 the second graph to compare
885     * @param <T> the type of the nodes in those graphs
886     * @return true if g1 and g2 are equals
887     */
888    pub fn graph_equals(&self, other: &Self) -> bool {
889        let vertices_1 = self.graph.node_indices().collect::<HashSet<NodeIndex>>();
890        let vertices_2 = other.graph.node_indices().collect::<HashSet<NodeIndex>>();
891        let edges_1 = self.graph.edge_indices().collect::<HashSet<EdgeIndex>>();
892        let edges_2 = other.graph.edge_indices().collect::<HashSet<EdgeIndex>>();
893
894        if vertices_1.len() != vertices_2.len() || edges_1.len() != edges_2.len() {
895            return false;
896        }
897
898        if vertices_1.len() == 0
899            && vertices_2.len() == 0
900            && edges_1.len() == 0
901            && edges_2.len() == 0
902        {
903            return true;
904        }
905
906        //for every vertex in g1 there is a vertex in g2 with an equal getSequenceString
907        let ok = vertices_1
908            .iter()
909            .map(|v1| self.graph.node_weight(*v1).unwrap().get_sequence())
910            .all(|v1_seq_string| {
911                vertices_2
912                    .iter()
913                    .any(|v2| v1_seq_string == other.graph.node_weight(*v2).unwrap().get_sequence())
914            });
915        if !ok {
916            return false;
917        }
918
919        //for every edge in g1 there is an equal edge in g2
920        let ok_g1 = edges_1
921            .iter()
922            .all(|e1| edges_2.iter().any(|e2| self.seq_equals(*e1, *e2, &other)));
923        if !ok_g1 {
924            return false;
925        }
926
927        return edges_2
928            .iter()
929            .all(|e2| edges_1.iter().any(|e1| other.seq_equals(*e2, *e1, self)));
930    }
931
932    fn seq_equals(&self, edge_1: EdgeIndex, edge_2: EdgeIndex, other: &Self) -> bool {
933        return self
934            .graph
935            .node_weight(self.get_edge_source(edge_1))
936            .unwrap()
937            .get_sequence()
938            == other
939                .graph
940                .node_weight(other.get_edge_source(edge_2))
941                .unwrap()
942                .get_sequence()
943            && self
944                .graph
945                .node_weight(self.get_edge_target(edge_1))
946                .unwrap()
947                .get_sequence()
948                == other
949                    .graph
950                    .node_weight(other.get_edge_target(edge_2))
951                    .unwrap()
952                    .get_sequence();
953    }
954
955    /**
956     * Add nodes to the graph, if the node is already present in the graph then retrieve and return
957     * its current index. Use this instead of petgraph's usual add_node as we do not want to be able
958     * have identical nodes especially when dealing with k-mers
959     * **Returns** the NodeIndex of the provided vertex/node
960     */
961    pub fn add_node(&mut self, v: &V) -> NodeIndex {
962        if v.merge_identical_nodes() {
963            let index = self.graph.node_indices().find(|i| &self.graph[*i] == v);
964            match index {
965                None => return self.graph.add_node(v.clone()),
966                Some(index) => return index,
967            }
968        } else {
969            return self.graph.add_node(v.clone());
970        }
971    }
972
973    /**
974     * Add edge between source -> target if none exists, or add e to an already existing one if present
975     *
976     * @param source source vertex
977     * @param target vertex
978     * @param e edge to add
979     */
980    pub fn add_or_update_edge(&mut self, source: NodeIndex, target: NodeIndex, e: E) {
981        let prev = self.graph.find_edge(source, target);
982        match prev {
983            None => {
984                self.graph.add_edge(source, target, e);
985            }
986            Some(prev) => {
987                let prev_weight = self.graph.edge_weight_mut(prev).unwrap();
988                prev_weight.add(e);
989            }
990        }
991    }
992
993    /**
994     * Returns the index of a given node if it is found in the graph. None if not found.
995     */
996    pub fn index_from_vertex(&self, v: &V) -> Option<NodeIndex> {
997        let index = self.graph.node_indices().find(|i| &self.graph[*i] == v);
998        return index;
999    }
1000}
1001
1002pub struct TestGraph {
1003    pub graph: BaseGraph<MultiDeBruijnVertex, BaseEdgeStruct>,
1004}
1005
1006impl TestGraph {
1007    pub fn new(kmer_size: usize) -> Self {
1008        Self {
1009            graph: BaseGraph::new(kmer_size),
1010        }
1011    }
1012
1013    /**
1014     * Only used for testing purposes
1015     * Add edge to assembly graph connecting the two kmers
1016     * @param kmer1 the source kmer for the edge
1017     * @param kmer2 the target kmer for the edge
1018     * @param isRef true if the added edge is a reference edge
1019     */
1020    pub fn add_kmers_to_graph(
1021        &mut self,
1022        kmer_1: &[u8],
1023        kmer_2: &[u8],
1024        is_ref: bool,
1025        multiplicity: usize,
1026    ) {
1027        assert_eq!(
1028            kmer_1.len(),
1029            kmer_2.len(),
1030            "Attempting to add kmers of differing lengths"
1031        );
1032        let v1 = MultiDeBruijnVertex::new(kmer_1.to_vec(), true);
1033        let v2 = MultiDeBruijnVertex::new(kmer_2.to_vec(), true);
1034        let to_add = BaseEdgeStruct::new(is_ref, multiplicity, 0);
1035
1036        let node_indices = self.graph.add_vertices(vec![&v1, &v2]);
1037        self.graph
1038            .add_or_update_edge(node_indices[0], node_indices[1], to_add);
1039    }
1040
1041    // /**
1042    //  * Convert this kmer graph to a simple sequence graph.
1043    //  *
1044    //  * Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
1045    //  * graph.  Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
1046    //  *
1047    //  * @return a newly allocated SequenceGraph
1048    //  */
1049    // pub fn to_sequence_graph(&self) -> SeqGraph<BaseEdgeStruct>
1050}