Skip to main content

gbz_base/
subgraph.rs

1//! A subgraph in a GBZ graph.
2//!
3//! This module provides functionality for extracting a subgraph around a specific position or interval of a specific path.
4//! The subgraph contains all nodes within a given context and all edges between them.
5//! It can also be extended to include all nodes in top-level snarls with boundary nodes already in the subgraph.
6//! All other paths within the subgraph can also be extracted, but they will not have any true metadata associated with them.
7
8use crate::{GBZRecord, GBZPath};
9use crate::{GraphInterface, GraphReference};
10use crate::PathIndex;
11use crate::{SubgraphQuery, HaplotypeOutput};
12use crate::subgraph::query::QueryType;
13use crate::formats::{self, WalkMetadata, JSONValue};
14
15use std::cmp::Reverse;
16use std::collections::{BinaryHeap, BTreeMap, BTreeSet, HashSet};
17use std::fmt::Display;
18use std::io::{self, Write};
19use std::iter::FusedIterator;
20use std::ops::Range;
21use std::cmp;
22
23use gbz::ENDMARKER;
24use gbz::{GBZ, GraphPosition, Orientation, NodeSide, Pos, FullPathName};
25use gbz::support::Chains;
26use gbz::{algorithms, support};
27
28use pggname::{Graph, GraphName};
29use pggname::graph::NodeInt;
30
31#[cfg(test)]
32mod tests;
33
34pub mod query;
35
36//-----------------------------------------------------------------------------
37
38/// A subgraph induced by a set of node identifiers.
39///
40/// The subgraph contains the specified nodes and the edges between them.
41/// Paths in the current subgraph can be extracted using [`Subgraph::extract_paths`].
42/// Only the provided reference path will have proper metadata.
43/// Other paths remain anonymous, as we cannot identify them efficiently using the GBWT.
44/// Any changes to the subgraph will remove the extracted paths.
45/// When a subgraph is changed, the operations will try to reuse already extracted node records.
46/// This makes it viable as a sliding window over a reference path.
47///
48/// Node identifiers can be selected using:
49/// * [`Subgraph::add_node_from_gbz`], [`Subgraph::add_node_from_db`], and [`Subgraph::remove_node`] with individual ids.
50/// * [`Subgraph::around_position`] for a context around a graph position.
51/// * [`Subgraph::around_interval`] for a context around path interval.
52/// * [`Subgraph::around_nodes`] for a context around a set of nodes.
53/// * [`Subgraph::between_nodes`] for all nodes and snarls in an interval of a chain.
54/// * [`Subgraph::extract_snarls`] to include all nodes in top-level snarls with boundary nodes in the current subgraph.
55///
56/// [`Subgraph::from_gbz`] and [`Subgraph::from_db`] are integrated methods for extracting a subgraph with paths using a [`SubgraphQuery`].
57///
58/// `Subgraph` implements a similar graph interface to the node/edge operations of [`GBZ`].
59/// It can also be serialized in GFA and JSON formats using [`Subgraph::write_gfa`] and [`Subgraph::write_json`].
60///
61/// [`Subgraph`] also implements [`Graph`] from the [`pggname`].
62/// That enables computing stable graph names using [`pggname::stable_name`].
63///
64/// # Examples
65///
66/// The following example replicates an offset-based [`Subgraph::from_gbz`] query using lower-level functions.
67/// It also demonstrates some parts of the graph interface in the subgraph object.
68///
69/// ```
70/// use gbz_base::{GBZRecord, GraphReference, PathIndex, Subgraph, HaplotypeOutput};
71/// use gbz::{GBZ, FullPathName, Orientation};
72/// use gbz::support;
73/// use simple_sds::serialize;
74///
75/// // Get the graph.
76/// let gbz_file = support::get_test_data("example.gbz");
77/// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
78///
79/// // Create a path index with 3 bp intervals.
80/// let path_index = PathIndex::new(&graph, 3, false).unwrap();
81///
82/// // Find the position for path A offset 2.
83/// let mut query_pos = FullPathName::generic("A");
84/// query_pos.fragment = 2;
85/// let mut subgraph = Subgraph::new();
86/// let result = subgraph.path_pos_from_gbz(&graph, &path_index, &query_pos);
87/// assert!(result.is_ok());
88/// let (path_pos, path_name) = result.unwrap();
89///
90/// // Extract a 1 bp context around the position.
91/// let result = subgraph.around_position(GraphReference::Gbz(&graph), path_pos.graph_pos(), 1);
92/// assert!(result.is_ok());
93///
94/// // Extract all paths in the subgraph.
95/// let result = subgraph.extract_paths(Some((path_pos, path_name)), HaplotypeOutput::All);
96/// assert!(result.is_ok());
97///
98/// // Compute the stable name of the subgraph with no parent graph.
99/// subgraph.compute_name(None);
100/// assert!(subgraph.has_graph_name());
101///
102/// // The subgraph should be centered around 1 bp node 14 of degree 4.
103/// let nodes = [12, 13, 14, 15, 16];
104/// assert_eq!(subgraph.nodes(), nodes.len());
105/// assert!(subgraph.node_iter().eq(nodes.iter().copied()));
106/// assert_eq!(subgraph.paths(), 3);
107///
108/// // The result is a subgraph induced by the nodes.
109/// for &node_id in nodes.iter() {
110///     assert!(subgraph.has_node(node_id));
111///     assert_eq!(subgraph.sequence(node_id), graph.sequence(node_id));
112///     for orientation in [Orientation::Forward, Orientation::Reverse] {
113///         // Successors are present if they are in the subgraph.
114///         let sub_succ = subgraph.successors(node_id, orientation).unwrap();
115///         let graph_succ = graph.successors(node_id, orientation).unwrap();
116///         assert!(sub_succ.eq(graph_succ.filter(|(id, _)| nodes.contains(id))));
117///         // And the same applies to predecessors.
118///         let sub_pred = subgraph.predecessors(node_id, orientation).unwrap();
119///         let graph_pred = graph.predecessors(node_id, orientation).unwrap();
120///         assert!(sub_pred.eq(graph_pred.filter(|(id, _)| nodes.contains(id))));
121///     }
122/// }
123/// ```
124#[derive(Clone, Debug, Default, PartialEq, Eq)]
125pub struct Subgraph {
126    // Node records for the subgraph.
127    records: BTreeMap<usize, GBZRecord>,
128
129    // Paths in the subgraph.
130    paths: Vec<PathInfo>,
131
132    // Offset in `paths` for the reference path, if any.
133    ref_id: Option<usize>,
134
135    // Metadata for the reference path, if any.
136    ref_path: Option<FullPathName>,
137
138    // Interval of the reference path that is present in the subgraph, if any.
139    ref_interval: Option<Range<usize>>,
140
141    // Stable graph name.
142    graph_name: Option<GraphName>,
143}
144
145//-----------------------------------------------------------------------------
146
147/// Construction.
148impl Subgraph {
149    /// Creates a new empty subgraph.
150    pub fn new() -> Self {
151        Subgraph::default()
152    }
153
154    /// Returns the path position for the haplotype offset represented by the query position.
155    ///
156    /// `query_pos.fragment` is used as an offset in the haplotype.
157    /// The return value consists of the position and metadata for the path covering the position.
158    /// Updates the subgraph to include the path from the nearest indexed position to the query position.
159    ///
160    /// # Arguments
161    ///
162    /// * `graph`: A GBZ graph.
163    /// * `path_index`: A path index for the graph.
164    /// * `query_pos`: Query position.
165    ///
166    /// # Errors
167    ///
168    /// Returns an error if there is no path covering the given position or the path has not been indexed for random access.
169    ///
170    /// # Examples
171    ///
172    /// ```
173    /// use gbz_base::{GBZRecord, Subgraph, GraphReference, PathIndex};
174    /// use gbz::{GBZ, FullPathName, Orientation};
175    /// use gbz::support;
176    /// use simple_sds::serialize;
177    ///
178    /// // Get the graph.
179    /// let gbz_file = support::get_test_data("example.gbz");
180    /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
181    ///
182    /// // Create a path index with 3 bp intervals.
183    /// let path_index = PathIndex::new(&graph, 3, false).unwrap();
184    ///
185    /// // Query for path A offset 2.
186    /// let mut query_pos = FullPathName::generic("A");
187    /// query_pos.fragment = 2;
188    /// let mut subgraph = Subgraph::new();
189    /// let result = subgraph.path_pos_from_gbz(&graph, &path_index, &query_pos);
190    /// assert!(result.is_ok());
191    /// let (path_pos, path_name) = result.unwrap();
192    /// assert_eq!(path_pos.seq_offset(), query_pos.fragment);
193    ///
194    /// // The query position is at the start of node 14 in forward orientation.
195    /// assert_eq!(path_pos.node_id(), 14);
196    /// assert_eq!(path_pos.orientation(), Orientation::Forward);
197    /// assert_eq!(path_pos.node_offset(), 0);
198    ///
199    /// // And this happens to be the first path visit to the node.
200    /// let gbwt_node = support::encode_node(14, Orientation::Forward);
201    /// assert_eq!(path_pos.handle(), gbwt_node);
202    /// assert_eq!(path_pos.gbwt_offset(), 0);
203    ///
204    /// // And it is covered by the path fragment starting from offset 0.
205    /// assert_eq!(path_name, FullPathName::generic("A"));
206    ///
207    /// // Now extract 1 bp context around interval 2..4.
208    /// let result = subgraph.around_interval(GraphReference::Gbz(&graph), path_pos, 2, 1);
209    /// assert!(result.is_ok());
210    ///
211    /// // The interval corresponds to nodes 14 and 15.
212    /// let true_nodes = vec![12, 13, 14, 15, 16, 17];
213    /// assert_eq!(subgraph.nodes(), true_nodes.len());
214    /// assert!(subgraph.node_iter().eq(true_nodes.iter().copied()));
215    /// ```
216    pub fn path_pos_from_gbz(
217        &mut self,
218        graph: &GBZ,
219        path_index: &PathIndex,
220        query_pos: &FullPathName
221    ) -> Result<(PathPosition, FullPathName), String> {
222        let path = GBZPath::with_name(graph, query_pos).ok_or_else(||
223            format!("Cannot find a path covering {}", query_pos)
224        )?;
225        // Transform the offset relative to the haplotype to the offset relative to the path.
226        let query_offset = query_pos.fragment - path.name.fragment;
227
228        // Path id to an indexed position.
229        let index_offset = path_index.path_to_offset(path.handle).ok_or_else(||
230            format!("Path {} has not been indexed for random access", path.name())
231        )?;
232        let (path_offset, pos) = path_index.indexed_position(index_offset, query_offset).unwrap();
233
234        self.find_path_pos(GraphReference::Gbz(graph), query_offset, path_offset, pos, path.name)
235    }
236
237    /// Returns the path position for the haplotype offset represented by the query position.
238    ///
239    /// `query_pos.fragment` is used as an offset in the haplotype.
240    /// The return value consists of the position and metadata for the path covering the position.
241    /// Updates the subgraph to include the path from the nearest indexed position to the query position.
242    ///
243    /// # Arguments
244    ///
245    /// * `graph`: A graph interface.
246    /// * `query_pos`: Query position.
247    ///
248    /// # Errors
249    ///
250    /// Returns an error if database operations fail.
251    /// Returns an error if there is no path covering the given position or the path has not been indexed for random access.
252    ///
253    /// # Examples
254    ///
255    /// ```
256    /// use gbz_base::{GBZBase, GraphInterface, Subgraph};
257    /// use gbz::{FullPathName, Orientation};
258    /// use gbz::support;
259    /// use simple_sds::serialize;
260    /// use std::fs;
261    ///
262    /// // Create the database.
263    /// let gbz_file = support::get_test_data("example.gbz");
264    /// let db_file = serialize::temp_file_name("subgraph");
265    /// let result = GBZBase::create_from_files(&gbz_file, None, &db_file);
266    /// assert!(result.is_ok());
267    ///
268    /// // Open the database and create a graph interface.
269    /// let database = GBZBase::open(&db_file).unwrap();
270    /// let mut interface = GraphInterface::new(&database).unwrap();
271    ///
272    /// // Query for path A offset 2.
273    /// let mut query_pos = FullPathName::generic("A");
274    /// query_pos.fragment = 2;
275    /// let mut subgraph = Subgraph::new();
276    /// let result = subgraph.path_pos_from_db(&mut interface, &query_pos);
277    /// assert!(result.is_ok());
278    /// let (path_pos, path_name) = result.unwrap();
279    /// assert_eq!(path_pos.seq_offset(), query_pos.fragment);
280    ///
281    /// // The query position is at the start of node 14 in forward orientation.
282    /// assert_eq!(path_pos.node_id(), 14);
283    /// assert_eq!(path_pos.orientation(), Orientation::Forward);
284    /// assert_eq!(path_pos.node_offset(), 0);
285    ///
286    /// // And this happens to be the first path visit to the node.
287    /// let gbwt_node = support::encode_node(14, Orientation::Forward);
288    /// assert_eq!(path_pos.handle(), gbwt_node);
289    /// assert_eq!(path_pos.gbwt_offset(), 0);
290    ///
291    /// // And it is covered by the path fragment starting from offset 0.
292    /// assert_eq!(path_name, FullPathName::generic("A"));
293    ///
294    /// // Clean up.
295    /// drop(interface);
296    /// drop(database);
297    /// fs::remove_file(&db_file).unwrap();
298    /// ```
299    pub fn path_pos_from_db<'reference, 'graph>(
300        &mut self,
301        graph: &'reference mut GraphInterface<'graph>,
302        query_pos: &FullPathName
303    ) -> Result<(PathPosition, FullPathName), String> {
304        let path = graph.find_path(query_pos)?;
305        let path = path.ok_or_else(|| format!("Cannot find a path covering {}", query_pos))?;
306        if !path.is_indexed {
307            return Err(format!("Path {} has not been indexed for random access", query_pos));
308        }
309        // Transform the offset relative to the haplotype to the offset relative to the path.
310        let query_offset = query_pos.fragment - path.name.fragment;
311
312        // Find an indexed position before the query position.
313        let result = graph.indexed_position(path.handle, query_offset)?;
314        let (path_offset, pos) = result.ok_or_else(||
315            format!("Path {} has not been indexed for random access", path.name())
316        )?;
317
318        self.find_path_pos(GraphReference::Db(graph), query_offset, path_offset, pos, path.name)
319    }
320
321    // Shared functionality for `path_pos_from_gbz` and `path_pos_from_db`.
322    fn find_path_pos(
323        &mut self,
324        graph: GraphReference<'_, '_>,
325        query_offset: usize,
326        path_offset: usize,
327        pos: Pos,
328        path_name: FullPathName
329    ) -> Result<(PathPosition, FullPathName), String> {
330        // Iterate over the path until the query position, updating the subgraph.
331        let mut path_offset = path_offset;
332        let mut pos = pos;
333        let mut node_offset: Option<usize> = None;
334        let mut gbwt_pos: Option<Pos> = None;
335        let mut graph = graph;
336        loop {
337            let node_id = support::node_id(pos.node);
338            if !self.has_node(node_id) {
339                self.add_node_internal(&mut graph, node_id)?;
340            }
341            let record = self.record(pos.node).unwrap();
342            if path_offset + record.sequence_len() > query_offset {
343                node_offset = Some(query_offset - path_offset);
344                gbwt_pos = Some(pos);
345                break;
346            }
347            path_offset += record.sequence_len();
348            let next = record.to_gbwt_record().lf(pos.offset);
349            if next.is_none() {
350                break;
351            }
352            pos = next.unwrap();
353        }
354
355        let node_offset = node_offset.ok_or_else(||
356            format!("Path {} does not contain offset {}", path_name, query_offset)
357        )?;
358        let gbwt_pos = gbwt_pos.unwrap();
359
360        let path_position = PathPosition {
361            seq_offset: query_offset,
362            gbwt_node: gbwt_pos.node,
363            node_offset,
364            gbwt_offset: gbwt_pos.offset,
365        };
366        Ok((path_position, path_name))
367    }
368
369    /// Updates the subgraph to a context around the given graph position.
370    ///
371    /// Reuses existing records when possible.
372    /// Removes node records outside the context as well as all path information.
373    /// Returns the number of inserted and removed nodes.
374    /// See [`Subgraph`] for an example.
375    ///
376    /// # Arguments
377    ///
378    /// * `graph`: A GBZ-compatible graph.
379    /// * `pos`: The reference position for the subgraph.
380    /// * `context`: The context length around the reference position (in bp).
381    ///
382    /// # Errors
383    ///
384    /// Passes through any errors from accessing the graph.
385    pub fn around_position(
386        &mut self,
387        graph: GraphReference<'_, '_>,
388        pos: GraphPosition,
389        context: usize
390    ) -> Result<(usize, usize), String> {
391        // The initial node is always in the subgraph, so we might as well add it now to determine sequence length.
392        let mut graph = graph;
393        if !self.has_node(pos.node) {
394            self.add_node_internal(&mut graph, pos.node)?;
395        }
396        let handle = pos.to_gbwt();
397        let record = self.record(handle).unwrap();
398
399        // Start the graph traversal from both sides of the initial node.
400        let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
401        let start = (pos.node, support::entry_side(pos.orientation));
402        active.push(Reverse((pos.offset, start)));
403        let end_distance = record.sequence_len() - pos.offset - 1;
404        let end = (pos.node, support::exit_side(pos.orientation));
405        active.push(Reverse((end_distance, end)));
406
407        self.insert_context(graph, active, context)
408    }
409
410    /// Updates the subgraph to a context around the given path interval.
411    ///
412    /// Reuses existing records when possible.
413    /// Removes node records outside the context as well as all path information.
414    /// Returns the number of inserted and removed nodes.
415    /// See [`Self::path_pos_from_gbz`] for an example.
416    ///
417    /// # Arguments
418    ///
419    /// * `graph`: A GBZ-compatible graph.
420    /// * `start_pos`: The starting position for the interval.
421    /// * `len`: Length of the interval (in bp).
422    /// * `context`: The context length around the reference interval (in bp).
423    ///
424    /// # Errors
425    ///
426    /// Passes through any errors from accessing the graph.
427    /// Returns an error if the interval is empty, starts outside the initial node, or is longer than the remaining path.
428    pub fn around_interval(
429        &mut self,
430        graph: GraphReference<'_, '_>,
431        start_pos: PathPosition,
432        len: usize,
433        context: usize
434    ) -> Result<(usize, usize), String> {
435        if len == 0 {
436            return Err(String::from("Interval length must be greater than 0"));
437        }
438        let mut pos = start_pos.gbwt_pos();
439        let mut offset = start_pos.node_offset();
440        let mut len = len;
441
442        // Insert all nodes in the interval to the subgraph and to active node sides.
443        let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
444        let mut graph = graph;
445        loop {
446            let node_id = support::node_id(pos.node);
447            let orientation = support::node_orientation(pos.node);
448            if !self.has_node(node_id) {
449                self.add_node_internal(&mut graph, node_id)?;
450            }
451            let record = self.record(pos.node).unwrap();
452            if offset >= record.sequence_len() {
453                return Err(format!("Offset {} in node {} of length {}", offset, node_id, record.sequence_len()));
454            }
455
456            // Handle the current node.
457            let start = (node_id, support::entry_side(orientation));
458            active.push(Reverse((offset, start)));
459            let end = (node_id, support::exit_side(orientation));
460            let distance_to_next = record.sequence_len() - offset;
461            if len <= distance_to_next {
462                let end_distance = if len == distance_to_next { 0 } else { distance_to_next - len - 1 };
463                active.push(Reverse((end_distance, end)));
464                break;
465            } else {
466                active.push(Reverse((0, end)));
467            }
468
469            // Proceed to the next node.
470            if let Some(next) = record.to_gbwt_record().lf(pos.offset) {
471                pos = next;
472            } else {
473                return Err(format!("No successor for GBWT position ({}, {})", pos.node, pos.offset));
474            }
475            offset = 0;
476            len -= distance_to_next;
477        }
478
479        self.insert_context(graph, active, context)
480    }
481
482    /// Updates the subgraph to a context around the given nodes.
483    ///
484    /// Reuses existing records when possible.
485    /// Removes node records outside the context as well as all path information.
486    /// Returns the number of inserted and removed nodes.
487    ///
488    /// # Arguments
489    ///
490    /// * `graph`: A GBZ-compatible graph.
491    /// * `nodes`: Set of reference nodes for the subgraph.
492    /// * `context`: The context length around the reference node (in bp).
493    ///
494    /// # Errors
495    ///
496    /// Passes through any errors from accessing the graph.
497    ///
498    /// # Examples
499    ///
500    /// ```
501    /// use gbz_base::{GBZRecord, Subgraph, GraphReference};
502    /// use gbz::GBZ;
503    /// use gbz::support;
504    /// use simple_sds::serialize;
505    /// use std::collections::BTreeSet;
506    ///
507    /// // Get the graph.
508    /// let gbz_file = support::get_test_data("translation.gbz");
509    /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
510    ///
511    /// // Extract a subgraph that contains an 1 bp context around node 5.
512    /// let mut subgraph = Subgraph::new();
513    /// let mut nodes = BTreeSet::new();
514    /// nodes.insert(5);
515    /// let result = subgraph.around_nodes(GraphReference::Gbz(&graph), &nodes, 1);
516    /// assert!(result.is_ok());
517    ///
518    /// // The subgraph should be centered around 2 bp node 5 of degree 3.
519    /// let true_nodes = [3, 4, 5, 6];
520    /// assert_eq!(subgraph.nodes(), true_nodes.len());
521    /// assert!(subgraph.node_iter().eq(true_nodes.iter().copied()));
522    ///
523    /// // But there are no paths.
524    /// assert_eq!(subgraph.paths(), 0);
525    /// ```
526    pub fn around_nodes(
527        &mut self,
528        graph: GraphReference<'_, '_>,
529        nodes: &BTreeSet<usize>,
530        context: usize
531    ) -> Result<(usize, usize), String> {
532        // Start the graph traversal from both sides of the initial nodes.
533        let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
534        for &node_id in nodes {
535            let start = (node_id, NodeSide::Left);
536            active.push(Reverse((0, start)));
537            let end = (node_id, NodeSide::Right);
538            active.push(Reverse((0, end)));
539        }
540
541        self.insert_context(graph, active, context)
542    }
543
544    // Inserts all nodes within the context, starting from the active node sides.
545    //
546    // Returns the number of inserted and removed nodes.
547    // All nodes in `active` are assumed to be in the subgraph, even if the distances to the sides exceed the context.
548    // Reuses existing records if possible.
549    // Removes existing nodes that are not in the new context.
550    // Clears all path information.
551    fn insert_context(
552        &mut self,
553        graph: GraphReference<'_, '_>,
554        active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>>,
555        context: usize
556    ) -> Result<(usize, usize), String> {
557        self.clear_paths();
558
559        let mut active = active;
560        let mut visited: BTreeSet<(usize, NodeSide)> = BTreeSet::new();
561        let mut to_remove: BTreeSet<usize> = self.node_iter().collect();
562        let mut inserted = 0;
563        let mut graph = graph;
564
565        while !active.is_empty() {
566            let (distance, node_side) = active.pop().unwrap().0;
567            if visited.contains(&node_side) {
568                continue;
569            }
570            visited.insert(node_side);
571            to_remove.remove(&node_side.0);
572            if !self.has_node(node_side.0) {
573                self.add_node_internal(&mut graph, node_side.0)?;
574                inserted += 1;
575            }
576
577            // We can reach the other side by traversing the node.
578            let other_side = (node_side.0, node_side.1.flip());
579            if !visited.contains(&other_side) {
580                let handle = support::encode_node(node_side.0, support::entry_orientation(node_side.1));
581                let record = self.record(handle).unwrap();
582                let next_distance = distance + record.sequence_len() - 1;
583                if next_distance <= context {
584                    active.push(Reverse((next_distance, other_side)));
585                }
586            }
587
588            // The predecessors of this node side are 1 bp away.
589            let handle = support::encode_node(node_side.0, support::exit_orientation(node_side.1));
590            let record = self.record(handle).unwrap();
591            let next_distance = distance + 1;
592            if next_distance <= context {
593                for successor in record.successors() {
594                    let node_id = support::node_id(successor);
595                    let side = support::entry_side(support::node_orientation(successor));
596                    if !visited.contains(&(node_id, side)) {
597                        active.push(Reverse((next_distance, (node_id, side))));
598                    }
599                }
600            }
601        }
602
603        let removed = to_remove.len();
604        for node_id in to_remove {
605            self.remove_node_internal(node_id);
606        }
607        Ok((inserted, removed))
608    }
609
610    /// Inserts all nodes between the given two handles into the subgraph.
611    ///
612    /// If `start` and `end` are in the same chain in the given order, this will insert all nodes and snarls between them.
613    /// Otherwise the behavior will be unpredictable and it is recommended to set a safety limit to avoid excessive memory usage.
614    /// Removes all path information.
615    /// Returns the number of inserted nodes.
616    ///
617    /// # Arguments
618    ///
619    /// * `graph`: A GBZ-compatible graph.
620    /// * `start`: Start handle (inclusive).
621    /// * `end`: End handle (inclusive).
622    /// * `limit`: Optional safety limit on the number of inserted nodes.
623    ///
624    /// # Examples
625    ///
626    /// ```
627    /// use gbz_base::{GBZRecord, Subgraph, GraphReference};
628    /// use gbz::{GBZ, Orientation};
629    /// use gbz::support;
630    /// use simple_sds::serialize;
631    ///
632    /// let graph_file = support::get_test_data("example.gbz");
633    /// let graph: GBZ = serialize::load_from(&graph_file).unwrap();
634    ///
635    /// let start = support::encode_node(14, Orientation::Forward);
636    /// let end = support::encode_node(17, Orientation::Forward);
637    /// let mut subgraph = Subgraph::new();
638    /// let result = subgraph.between_nodes(GraphReference::Gbz(&graph), start, end, None);
639    /// assert_eq!(result, Ok(4));
640    ///
641    /// let expected_nodes = [14, 15, 16, 17];
642    /// assert!(subgraph.node_iter().eq(expected_nodes.iter().copied()));
643    /// ```
644    pub fn between_nodes(&mut self, graph: GraphReference<'_, '_>, start: usize, end: usize, limit: Option<usize>) -> Result<usize, String> {
645        self.clear_paths();
646
647        // Active handles. We proceed to their successors but not predecessors.
648        let mut active = vec![start, support::flip_node(end)];
649        // Visited node identifiers.
650        let mut visited = HashSet::new();
651        visited.insert(support::decode_node(start).0);
652        visited.insert(support::decode_node(end).0);
653
654        // Determine node ids between the boundary nodes, inclusive.
655        // Add them to the subgraph if they are not already present.
656        let mut graph = graph;
657        let mut inserted = 0;
658        while let Some(curr) = active.pop() {
659            let (node_id, orientation) = support::decode_node(curr);
660            if !self.has_node(node_id) {
661                if let Some(limit) = limit && inserted >= limit {
662                    let (start_id, start_o) = support::decode_node(start);
663                    let (end_id, end_o) = support::decode_node(end);
664                    return Err(format!("Found more than {} new nodes between ({}, {}) and ({}, {})", limit, start_id, start_o, end_id, end_o));
665                }
666                self.add_node_internal(&mut graph, node_id)?;
667                inserted += 1;
668            }
669
670            for (next_id, next_o) in self.supergraph_successors(node_id, orientation).unwrap() {
671                if visited.contains(&next_id) {
672                    continue;
673                }
674                let fw_handle = support::encode_node(next_id, next_o);
675                let rev_handle = support::flip_node(fw_handle);
676                active.push(fw_handle); active.push(rev_handle);
677                visited.insert(next_id);
678            }
679        }
680
681        Ok(inserted)
682    }
683
684    /// Extracts nodes in top-level snarls covered by the current subgraph.
685    ///
686    /// Returns the number of inserted nodes.
687    /// Removes all path information.
688    /// If a chains are provided, this will use them for determining the top-level snarls.
689    /// Otherwise the links stored in node records are used.
690    /// Note that chains must be always provided when the subgraph has been extracted from a [`GBZ`] graph.
691    ///
692    /// # Errors
693    ///
694    /// Returns an error if the graph reference is [`GraphReference::None`].
695    /// Passes through errors from accessing the graph.
696    ///
697    /// # Examples
698    ///
699    /// ```
700    /// use gbz::{GBZ, Orientation};
701    /// use gbz::support::{self, Chains};
702    /// use gbz_base::{Subgraph, GraphReference};
703    /// use gbz_base::utils;
704    /// use simple_sds::serialize;
705    /// use std::collections::BTreeSet;
706    ///
707    /// let graph_file = support::get_test_data("example.gbz");
708    /// let graph: GBZ = serialize::load_from(&graph_file).unwrap();
709    /// let chains_file = utils::get_test_data("example.chains");
710    /// let chains: Chains = serialize::load_from(&chains_file).unwrap();
711    ///
712    /// // Extract the boundary nodes of a snarl as the initial subgraph.
713    /// let mut subgraph = Subgraph::new();
714    /// let nodes: BTreeSet<usize> = [14, 17].into_iter().collect();
715    /// let result = subgraph.around_nodes(GraphReference::Gbz(&graph), &nodes, 0);
716    /// assert!(result.is_ok());
717    /// assert!(subgraph.node_iter().eq(nodes.iter().copied()));
718    ///
719    /// // Now extract the snarl between the boundary nodes.
720    /// let result = subgraph.extract_snarls(GraphReference::Gbz(&graph), Some(&chains));
721    /// assert!(result.is_ok());
722    /// let snarl_nodes = [14, 15, 16, 17];
723    /// assert!(subgraph.node_iter().eq(snarl_nodes.iter().copied()));
724    /// ```
725    pub fn extract_snarls(&mut self, graph: GraphReference<'_, '_>, chains: Option<&Chains>) -> Result<usize, String> {
726        let snarls = self.covered_snarls(chains);
727
728        let mut inserted = 0;
729        let mut graph = graph;
730        for (start, end) in snarls {
731            match &mut graph {
732                GraphReference::Gbz(graph) => {
733                    inserted += self.between_nodes(GraphReference::Gbz(graph), start, end, None)?;
734                }
735                GraphReference::Db(graph) => {
736                    inserted += self.between_nodes(GraphReference::Db(graph), start, end, None)?;
737                },
738                GraphReference::None => {
739                    return Err(String::from("No graph reference provided"));
740                }
741            }
742        }
743
744        Ok(inserted)
745    }
746
747    /// Extracts a subgraph around the given query position.
748    ///
749    /// Reuses existing records when possible.
750    /// Removes node records not covered by the query.
751    ///
752    /// # Arguments
753    ///
754    /// * `graph`: A GBZ graph.
755    /// * `path_index`: A path index for the graph, if the query is path-based.
756    /// * `chains`: A set of top-level chains, if the query extracts nodes in covered snarls.
757    /// * `query`: Arguments for extracting the subgraph.
758    ///
759    /// # Errors
760    ///
761    /// Returns an error, if:
762    ///
763    /// * The query or the graph is invalid.
764    /// * The graph does not contain the queried position.
765    /// * A path index is required but not provided, or if the query path has not been indexed.
766    ///
767    /// If an error occurs, the subgraph may contain arbitrary nodes but no paths.
768    ///
769    /// # Examples
770    ///
771    /// ```
772    /// use gbz_base::{PathIndex, Subgraph, SubgraphQuery, HaplotypeOutput};
773    /// use gbz::{GBZ, FullPathName};
774    /// use gbz::support;
775    /// use simple_sds::serialize;
776    ///
777    /// // Get the graph.
778    /// let gbz_file = support::get_test_data("example.gbz");
779    /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
780    ///
781    /// // Create a path index with 3 bp intervals.
782    /// let path_index = PathIndex::new(&graph, 3, false).unwrap();
783    ///
784    /// // Extract a subgraph that contains an 1 bp context around path A offset 2.
785    /// let path_name = FullPathName::generic("A");
786    /// let query = SubgraphQuery::path_offset(&path_name, 2).with_context(1);
787    /// let mut subgraph = Subgraph::new();
788    /// let result = subgraph.from_gbz(&graph, Some(&path_index), None, &query);
789    /// assert!(result.is_ok());
790    ///
791    /// // The subgraph should be centered around 1 bp node 14 of degree 4.
792    /// assert_eq!(subgraph.nodes(), 5);
793    /// assert_eq!(subgraph.paths(), 3);
794    ///
795    /// // We get the same result using a node id.
796    /// let query = SubgraphQuery::nodes([14]).with_context(1);
797    /// let mut subgraph = Subgraph::new();
798    /// let result = subgraph.from_gbz(&graph, None, None, &query);
799    /// assert!(result.is_ok());
800    /// assert_eq!(subgraph.nodes(), 5);
801    /// assert_eq!(subgraph.paths(), 3);
802    /// ```
803    pub fn from_gbz(&mut self, graph: &GBZ, path_index: Option<&PathIndex>, chains: Option<&Chains>, query: &SubgraphQuery) -> Result<(), String> {
804        if query.snarls() && chains.is_none() {
805            return Err(String::from("Top-level chains are required for extracting snarls"));
806        }
807        match query.query_type() {
808            QueryType::PathOffset(query_pos) => {
809                let path_index = path_index.ok_or_else(||
810                    String::from("Path index is required for path-based queries")
811                )?;
812                let reference_path = self.path_pos_from_gbz(graph, path_index, query_pos)?;
813                self.around_position(GraphReference::Gbz(graph), reference_path.0.graph_pos(), query.context())?;
814                if query.snarls() {
815                    self.extract_snarls(GraphReference::Gbz(graph), chains)?;
816                }
817                self.extract_paths(Some(reference_path), query.output())?;
818            },
819            QueryType::PathInterval(query_pos, len) => {
820                let path_index = path_index.ok_or_else(||
821                    String::from("Path index is required for path-based queries")
822                )?;
823                let reference_path = self.path_pos_from_gbz(graph, path_index, query_pos)?;
824                self.around_interval(GraphReference::Gbz(graph), reference_path.0, *len, query.context())?;
825                if query.snarls() {
826                    self.extract_snarls(GraphReference::Gbz(graph), chains)?;
827                }
828                self.extract_paths(Some(reference_path), query.output())?;
829            },
830            QueryType::Nodes(nodes) => {
831                if query.output() == HaplotypeOutput::ReferenceOnly {
832                    return Err(String::from("Cannot output a reference path in a node-based query"));
833                }
834                self.around_nodes(GraphReference::Gbz(graph), nodes, query.context())?;
835                if query.snarls() {
836                    self.extract_snarls(GraphReference::Gbz(graph), chains)?;
837                }
838                self.extract_paths(None, query.output())?;
839            },
840            QueryType::Between((start, end), limit) => {
841                if query.output() == HaplotypeOutput::ReferenceOnly {
842                    return Err(String::from("Cannot output a reference path in a node-based query"));
843                }
844                self.between_nodes(GraphReference::Gbz(graph), *start, *end, *limit)?;
845                self.extract_paths(None, query.output())?;
846            },
847        }
848
849        // Determine the stable graph name and relationships.
850        let parent = GraphName::from_gbz(graph);
851        self.compute_name(Some(&parent));
852
853        Ok(())
854    }
855
856    /// Extracts a subgraph around the given query position.
857    ///
858    /// Reuses existing records when possible.
859    /// Removes node records not covered by the query.
860    ///
861    /// # Errors
862    ///
863    /// Returns an error, if:
864    ///
865    /// * The query or the graph is invalid or if there is a database error.
866    /// * The graph does not contain the queried position.
867    /// * A path index is required but not provided, or if the query path has not been indexed.
868    ///
869    /// If an error occurs, the subgraph may contain arbitrary nodes but no paths.
870    ///
871    /// # Examples
872    ///
873    /// ```
874    /// use gbz_base::{GBZBase, GraphInterface, Subgraph, SubgraphQuery, HaplotypeOutput};
875    /// use gbz::FullPathName;
876    /// use gbz::support;
877    /// use simple_sds::serialize;
878    /// use std::fs;
879    ///
880    /// // Create the database.
881    /// let gbz_file = support::get_test_data("example.gbz");
882    /// let db_file = serialize::temp_file_name("subgraph");
883    /// let result = GBZBase::create_from_files(&gbz_file, None, &db_file);
884    /// assert!(result.is_ok());
885    ///
886    /// // Open the database and create a graph interface.
887    /// let database = GBZBase::open(&db_file).unwrap();
888    /// let mut interface = GraphInterface::new(&database).unwrap();
889    ///
890    /// // Extract a subgraph that contains an 1 bp context around path A offset 2.
891    /// let path_name = FullPathName::generic("A");
892    /// let query = SubgraphQuery::path_offset(&path_name, 2).with_context(1);
893    /// let mut subgraph = Subgraph::new();
894    /// let result = subgraph.from_db(&mut interface, &query);
895    /// assert!(result.is_ok());
896    ///
897    /// // The subgraph should be centered around 1 bp node 14 of degree 4.
898    /// assert_eq!(subgraph.nodes(), 5);
899    /// assert_eq!(subgraph.paths(), 3);
900    ///
901    /// // Clean up.
902    /// drop(interface);
903    /// drop(database);
904    /// fs::remove_file(&db_file).unwrap();
905    /// ```
906    pub fn from_db<'reference, 'graph>(&mut self, graph: &'reference mut GraphInterface<'graph>, query: &SubgraphQuery) -> Result<(), String> {
907        match query.query_type() {
908            QueryType::PathOffset(query_pos) => {
909                let reference_path = self.path_pos_from_db(graph, query_pos)?;
910                self.around_position(GraphReference::Db(graph), reference_path.0.graph_pos(), query.context())?;
911                if query.snarls() {
912                    self.extract_snarls(GraphReference::Db(graph), None)?;
913                }
914                self.extract_paths(Some(reference_path), query.output())?;
915            },
916            QueryType::PathInterval(query_pos, len) => {
917                let reference_path = self.path_pos_from_db(graph, query_pos)?;
918                self.around_interval(GraphReference::Db(graph), reference_path.0, *len, query.context())?;
919                if query.snarls() {
920                    self.extract_snarls(GraphReference::Db(graph), None)?;
921                }
922                self.extract_paths(Some(reference_path), query.output())?;
923            },
924            QueryType::Nodes(nodes) => {
925                if query.output() == HaplotypeOutput::ReferenceOnly {
926                    return Err(String::from("Cannot output a reference path in a node-based query"));
927                }
928                self.around_nodes(GraphReference::Db(graph), nodes, query.context())?;
929                if query.snarls() {
930                    self.extract_snarls(GraphReference::Db(graph), None)?;
931                }
932                self.extract_paths(None, query.output())?;
933            },
934            QueryType::Between((start, end), limit) => {
935                if query.output() == HaplotypeOutput::ReferenceOnly {
936                    return Err(String::from("Cannot output a reference path in a node-based query"));
937                }
938                self.between_nodes(GraphReference::Db(graph), *start, *end, *limit)?;
939                self.extract_paths(None, query.output())?;
940            },
941        }
942
943        // Determine the stable graph name and relationships.
944        let parent = graph.graph_name()?;
945        self.compute_name(Some(&parent));
946
947        Ok(())
948    }
949
950    // Returns the successor position for the given GBWT position, if it is in the subgraph.
951    fn next_pos(pos: Pos, successors: &BTreeMap<usize, Vec<(Pos, bool)>>) -> Option<Pos> {
952        if let Some(v) = successors.get(&pos.node) {
953            let (next, _) = v[pos.offset];
954            if next.node == ENDMARKER || !successors.contains_key(&next.node) {
955                None
956            } else {
957                Some(next)
958            }
959        } else {
960            None
961        }
962    }
963
964    /// Extracts all paths in the subgraph.
965    ///
966    /// Clears the current paths in the subgraph.
967    /// If a reference path is given, one of the paths is assumed to visit the position.
968    /// This will then set all information related to the reference path.
969    ///
970    /// # Arguments
971    ///
972    /// * `reference_path`: Position on the reference path and the name of the path.
973    /// * `output`: How to output the haplotypes.
974    ///
975    /// # Errors
976    ///
977    /// Returns an error if no path visits the reference position.
978    /// Returns an error if reference-only output is requested without a reference path.
979    /// Clears all path information in case of an error.
980    ///
981    /// # Examples
982    ///
983    /// ```
984    /// use gbz_base::{GBZRecord, Subgraph, HaplotypeOutput};
985    /// use gbz::GBZ;
986    /// use gbz::support;
987    /// use simple_sds::serialize;
988    ///
989    /// // Get the graph.
990    /// let gbz_file = support::get_test_data("example.gbz");
991    /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
992    ///
993    /// // Start with an empty subgraph and add a node.
994    /// let mut subgraph = Subgraph::new();
995    /// let result = subgraph.add_node_from_gbz(&graph,15);
996    /// assert!(result.is_ok());
997    ///
998    /// // There are no paths until we extract them.
999    /// assert_eq!(subgraph.paths(), 0);
1000    /// let result = subgraph.extract_paths(None, HaplotypeOutput::All);
1001    /// assert!(result.is_ok());
1002    /// assert_eq!(subgraph.paths(), 2);
1003    ///
1004    /// // When we change the subgraph, we need to extract the paths again.
1005    /// for node_id in [13, 14] {
1006    ///    let result = subgraph.add_node_from_gbz(&graph, node_id);
1007    ///    assert!(result.is_ok());
1008    /// }
1009    /// assert_eq!(subgraph.paths(), 0);
1010    /// let result = subgraph.extract_paths(None, HaplotypeOutput::All);
1011    /// assert!(result.is_ok());
1012    /// assert_eq!(subgraph.paths(), 3);
1013    ///
1014    /// // The same is true for removing nodes.
1015    /// for node_id in [14, 15] {
1016    ///     subgraph.remove_node(node_id);
1017    /// }
1018    /// assert_eq!(subgraph.paths(), 0);
1019    /// let result = subgraph.extract_paths(None, HaplotypeOutput::All);
1020    /// assert!(result.is_ok());
1021    /// assert_eq!(subgraph.paths(), 1);
1022    /// ```
1023    pub fn extract_paths(
1024        &mut self,
1025        reference_path: Option<(PathPosition, FullPathName)>,
1026        output: HaplotypeOutput
1027    ) -> Result<(), String> {
1028        self.clear_paths();
1029
1030        let ref_pos;
1031        if let Some((position, name)) = reference_path {
1032            ref_pos = Some(position);
1033            self.ref_path = Some(name);
1034        } else {
1035            ref_pos = None;
1036        }
1037
1038        // Decompress the GBWT node records for the subgraph.
1039        let mut keys: Vec<usize> = Vec::new();
1040        let mut successors: BTreeMap<usize, Vec<(Pos, bool)>> = BTreeMap::new();
1041        for (handle, gbz_record) in self.records.iter() {
1042            let gbwt_record = gbz_record.to_gbwt_record();
1043            let decompressed: Vec<(Pos, bool)> = gbwt_record.decompress().into_iter().map(|x| (x, false)).collect();
1044            keys.push(*handle);
1045            successors.insert(*handle, decompressed);
1046        }
1047
1048        // Mark the positions that have predecessors in the subgraph.
1049        for handle in keys.iter() {
1050            let decompressed = successors.get(handle).unwrap().clone();
1051            for (pos, _) in decompressed.iter() {
1052                if let Some(v) = successors.get_mut(&pos.node) {
1053                    v[pos.offset].1 = true;
1054                }
1055            }
1056        }
1057
1058        // TODO: Check for infinite loops.
1059        // Extract all paths and note if one of them passes through `ref_pos`.
1060        // `ref_offset` is the offset of the node containing `ref_pos`.
1061        let mut ref_offset: Option<usize> = None;
1062        for (handle, positions) in successors.iter() {
1063            for (offset, (_, has_predecessor)) in positions.iter().enumerate() {
1064                if *has_predecessor {
1065                    continue;
1066                }
1067                let mut curr = Some(Pos::new(*handle, offset));
1068                let mut is_ref = false;
1069                let mut path: Vec<usize> = Vec::new();
1070                let mut len = 0;
1071                while let Some(pos) = curr {
1072                    if let Some(position) = ref_pos.as_ref() && pos == position.gbwt_pos() {
1073                        self.ref_id = Some(self.paths.len());
1074                        ref_offset = Some(path.len());
1075                        is_ref = true;
1076                    }
1077                    path.push(pos.node);
1078                    len += self.records.get(&pos.node).unwrap().sequence_len();
1079                    curr = Self::next_pos(pos, &successors);
1080                }
1081                if is_ref {
1082                    if !support::encoded_path_is_canonical(&path) {
1083                        eprintln!("Warning: the reference path is not in canonical orientation");
1084                    }
1085                    self.paths.push(PathInfo::new(path, len));
1086                } else if support::encoded_path_is_canonical(&path) {
1087                    self.paths.push(PathInfo::new(path, len));
1088                }
1089            }
1090        }
1091
1092        // Now we can set the reference interval.
1093        if let Some(ref_pos) = ref_pos {
1094            if let Some(offset) = ref_offset {
1095                let ref_info = &self.paths[self.ref_id.unwrap()];
1096                self.ref_interval = Some(ref_info.path_interval(self, offset, &ref_pos));
1097            } else {
1098                self.clear_paths();
1099                return Err(String::from("Could not find the reference path"));
1100            }
1101        }
1102
1103        // Haplotype output.
1104        if output == HaplotypeOutput::Distinct {
1105            self.distinct_paths();
1106        } else if output == HaplotypeOutput::ReferenceOnly {
1107            self.reference_only()?;
1108        }
1109
1110        Ok(())
1111    }
1112
1113    // Sorts the paths and merges duplicates, storing the count in the weight field.
1114    fn distinct_paths(&mut self) {
1115        let ref_path = self.ref_id.map(|id| self.paths[id].path.clone());
1116        self.paths.sort_unstable();
1117
1118        let mut new_paths: Vec<PathInfo> = Vec::new();
1119        let mut ref_id = None;
1120        for info in self.paths.iter() {
1121            if new_paths.is_empty() || new_paths.last().unwrap().path != info.path {
1122                if let Some(ref_path) = &ref_path  && info.path == *ref_path {
1123                    ref_id = Some(new_paths.len());
1124                }
1125                new_paths.push(PathInfo::weighted(info.path.clone(), info.len));
1126            } else {
1127                new_paths.last_mut().unwrap().increment_weight();
1128            }
1129        }
1130
1131        self.paths = new_paths;
1132        self.ref_id = ref_id;
1133    }
1134
1135    // Removes all paths except the reference path.
1136    fn reference_only(&mut self) -> Result<(), String> {
1137        if self.ref_id.is_none() {
1138            return Err(String::from("Reference path is required for reference-only output"));
1139        }
1140        let ref_id = self.ref_id.unwrap();
1141        let ref_info = self.paths[ref_id].clone();
1142        self.paths = vec![ref_info];
1143        self.ref_id = Some(0);
1144        Ok(())
1145    }
1146
1147    // Adds a new node to the subgraph.
1148    fn add_node_internal(&mut self, graph: &mut GraphReference<'_, '_>, node_id: usize) -> Result<(), String> {
1149        let forward = graph.gbz_record(support::encode_node(node_id, Orientation::Forward))?;
1150        let reverse = graph.gbz_record(support::encode_node(node_id, Orientation::Reverse))?;
1151        self.records.insert(forward.handle(), forward);
1152        self.records.insert(reverse.handle(), reverse);
1153        Ok(())
1154    }
1155
1156    /// Inserts the given node into the subgraph.
1157    ///
1158    /// No effect if the node is already in the subgraph.
1159    /// Clears all path information from the subgraph.
1160    /// See also [`Self::add_node_from_db`].
1161    ///
1162    /// # Arguments
1163    ///
1164    /// * `graph`: A GBZ graph.
1165    /// * `node_id`: Identifier of the node to insert.
1166    ///
1167    /// # Errors
1168    ///
1169    /// Returns an error if the node does not exist in the graph.
1170    pub fn add_node_from_gbz(&mut self, graph: &GBZ, node_id: usize) -> Result<(), String> {
1171        if self.has_node(node_id) {
1172            return Ok(());
1173        }
1174        self.clear_paths();
1175        self.add_node_internal(&mut GraphReference::Gbz(graph), node_id)
1176    }
1177
1178    /// Inserts the given node into the subgraph.
1179    ///
1180    /// No effect if the node is already in the subgraph.
1181    /// Clears all path information from the subgraph.
1182    /// See also [`Self::add_node_from_gbz`].
1183    ///
1184    /// # Arguments
1185    ///
1186    /// * `graph`: Graph interface.
1187    /// * `node_id`: Identifier of the node to insert.
1188    ///
1189    /// # Errors
1190    ///
1191    /// Returns an error if the node does not exist in the graph.
1192    /// Passes through any errors from the database.
1193    pub fn add_node_from_db(&mut self, graph: &mut GraphInterface, node_id: usize) -> Result<(), String> {
1194        if self.has_node(node_id) {
1195            return Ok(());
1196        }
1197        self.clear_paths();
1198        self.add_node_internal(&mut GraphReference::Db(graph), node_id)
1199    }
1200
1201    // Removes the given node from the subgraph.
1202    fn remove_node_internal(&mut self, node_id: usize) {
1203        self.records.remove(&support::encode_node(node_id, Orientation::Forward));
1204        self.records.remove(&support::encode_node(node_id, Orientation::Reverse));
1205    }
1206
1207    /// Removes the given node from the subgraph.
1208    ///
1209    /// No effect if the node is not in the subgraph.
1210    /// Clears all path information from the subgraph.
1211    pub fn remove_node(&mut self, node_id: usize) {
1212        if !self.has_node(node_id) {
1213            return;
1214        }
1215        self.clear_paths();
1216        self.remove_node_internal(node_id);
1217    }
1218
1219    // Clears all path information from the subgraph.
1220    fn clear_paths(&mut self) {
1221        self.paths.clear();
1222        self.ref_id = None;
1223        self.ref_path = None;
1224        self.ref_interval = None;
1225    }
1226}
1227
1228//-----------------------------------------------------------------------------
1229
1230/// Node/edge operations similar to [`GBZ`].
1231impl Subgraph {
1232    /// Returns the number of nodes in the subgraph.
1233    #[inline]
1234    pub fn nodes(&self) -> usize {
1235        self.records.len() / 2
1236    }
1237
1238    /// Returns the smallest node identifier in the subgraph.
1239    #[inline]
1240    pub fn min_node(&self) -> Option<usize> {
1241        self.records.keys().next().map(|&handle| support::node_id(handle))
1242    }
1243
1244    /// Returns the largest node identifier in the subgraph.
1245    #[inline]
1246    pub fn max_node(&self) -> Option<usize> {
1247        self.records.keys().next_back().map(|&handle| support::node_id(handle))
1248    }
1249
1250    /// Returns the smallest handle in the subgraph.
1251    #[inline]
1252    pub fn min_handle(&self) -> Option<usize> {
1253        self.records.keys().next().copied()
1254    }
1255
1256    /// Returns the largest handle in the subgraph.
1257    #[inline]
1258    pub fn max_handle(&self) -> Option<usize> {
1259        self.records.keys().next_back().copied()
1260    }
1261
1262    /// Returns `true` if the subgraph contains the given node.
1263    #[inline]
1264    pub fn has_node(&self, node_id: usize) -> bool {
1265        self.records.contains_key(&support::encode_node(node_id, Orientation::Forward))
1266    }
1267
1268    // Returns `true` if the subgraph contains the given handle.
1269    #[inline]
1270    pub fn has_handle(&self, handle: usize) -> bool {
1271        self.records.contains_key(&handle)
1272    }
1273
1274    /// Returns the sequence for the handle in the subgraph, or [`None`] if there is no such handle.
1275    #[inline]
1276    pub fn sequence_for_handle(&self, handle: usize) -> Option<&[u8]> {
1277        self.records.get(&handle).map(|record| record.sequence())
1278    }
1279
1280    /// Returns the sequence for the node in the subgraph, or [`None`] if there is no such node.
1281    #[inline]
1282    pub fn sequence(&self, node_id: usize) -> Option<&[u8]> {
1283        self.sequence_for_handle(support::encode_node(node_id, Orientation::Forward))
1284    }
1285
1286    /// Returns the sequence for the node in the given orientation, or [`None`] if there is no such node.
1287    #[inline]
1288    pub fn oriented_sequence(&self, node_id: usize, orientation: Orientation) -> Option<&[u8]> {
1289        self.sequence_for_handle(support::encode_node(node_id, orientation))
1290    }
1291
1292    /// Returns the length of the sequence for the node in the subgraph, or [`None`] if there is no such node.
1293    #[inline]
1294    pub fn sequence_len(&self, node_id: usize) -> Option<usize> {
1295        self.records.get(&support::encode_node(node_id, Orientation::Forward)).map(|record| record.sequence_len())
1296    }
1297
1298    /// Returns an iterator over the node identifiers in the subgraph.
1299    ///
1300    /// The identifiers are listed in ascending order.
1301    pub fn node_iter(&'_ self) -> impl Iterator<Item = usize> + use<'_> {
1302        self.records.keys().step_by(2).map(|&handle| support::node_id(handle))
1303    }
1304
1305    /// Returns an iterator over the handles in the subgraph.
1306    ///
1307    /// The handles are listed in ascending order.
1308    pub fn handle_iter(&'_ self) -> impl Iterator<Item = usize> + use<'_> {
1309        self.records.keys().copied()
1310    }
1311
1312    /// Returns an iterator over the successors of an oriented node, or [`None`] if there is no such node.
1313    pub fn successors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1314        let handle = support::encode_node(node_id, orientation);
1315        let record = self.records.get(&handle)?;
1316        Some(EdgeIter::new(self, record, false))
1317    }
1318
1319    /// Returns an iterator over the successors of an oriented node in the supergraph, or [`None`] if there is no such node.
1320    ///
1321    /// This is otherwise the same as [`Self::successors`], but this also lists successors not in the subgraph.
1322    pub fn supergraph_successors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1323        let handle = support::encode_node(node_id, orientation);
1324        let record = self.records.get(&handle)?;
1325        Some(EdgeIter::in_supergraph(self, record, false))
1326    }
1327
1328    /// Returns an iterator over the predecessors of an oriented node, or [`None`] if there is no such node.
1329    pub fn predecessors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1330        let handle = support::encode_node(node_id, orientation.flip());
1331        let record = self.records.get(&handle)?;
1332        Some(EdgeIter::new(self, record, true))
1333    }
1334
1335    /// Returns an iterator over the predecessors of an oriented node in the supergraph, or [`None`] if there is no such node.
1336    ///
1337    /// This is otherwise the same as [`Self::predecessors`], but this also lists predecessors not in the subgraph.
1338    pub fn supergraph_predecessors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1339        let handle = support::encode_node(node_id, orientation.flip());
1340        let record = self.records.get(&handle)?;
1341        Some(EdgeIter::in_supergraph(self, record, true))
1342    }
1343
1344    // Returns the record for the given node handle, or [`None`] if the node is not in the subgraph.
1345    #[inline]
1346    fn record(&self, handle: usize) -> Option<&GBZRecord> {
1347        self.records.get(&handle)
1348    }
1349
1350    /// Returns the number of paths in the subgraph.
1351    #[inline]
1352    pub fn paths(&self) -> usize {
1353        self.paths.len()
1354    }
1355
1356    /// Returns the top-level snarls covered by the current subgraph.
1357    ///
1358    /// Each snarl is reported as a pair of handles `(start, end)` of its boundary nodes.
1359    /// Here `start` points inside the snarl and `end` points outside the snarl.
1360    /// The snarls are returned in the canonical orientation (see [`support::edge_is_canonical`]).
1361    ///
1362    /// By default, this uses the chain links stored in node records.
1363    /// However, a [`crate::GBZBase`] does not always store top-level chains, and a [`GBZ`] graph does not have that information.
1364    /// In such cases a [`Chains`] object can be provided as an override.
1365    pub fn covered_snarls(&self, chains: Option<&Chains>) -> BTreeSet<(usize, usize)> {
1366        let mut snarls: BTreeSet<(usize, usize)> = BTreeSet::new();
1367        for (&handle, record) in self.records.iter() {
1368            let next = if let Some(chains) = chains {
1369                chains.next(handle)
1370            } else {
1371                record.next()
1372            };
1373            if next.is_none() {
1374                continue;
1375            }
1376            let next = next.unwrap();
1377            if !self.has_handle(next) {
1378                continue;
1379            }
1380            if support::encoded_edge_is_canonical(handle, next) {
1381                snarls.insert((handle, next));
1382            }
1383        }
1384        snarls
1385    }
1386}
1387
1388//-----------------------------------------------------------------------------
1389
1390/// Alignment to the reference.
1391impl Subgraph {
1392    // Returns the total length of the nodes.
1393    fn path_len(&self, path: &[usize]) -> usize {
1394        let mut result = 0;
1395        for handle in path.iter() {
1396            let record = self.records.get(handle).unwrap();
1397            result += record.sequence_len();
1398        }
1399        result
1400    }
1401
1402    // Appends a new edit or extends the previous one. No effect if `len` is zero.
1403    fn append_edit(edits: &mut Vec<(EditOperation, usize)>, op: EditOperation, len: usize) {
1404        if len == 0 {
1405            return;
1406        }
1407        if let Some((prev_op, prev_len)) = edits.last_mut() && *prev_op == op {
1408            *prev_len += len;
1409            return;
1410        }
1411        edits.push((op, len));
1412    }
1413
1414    // Returns the number of matching bases at the start of the two paths.
1415    fn prefix_matches(&self, path: &[usize], ref_path: &[usize]) -> usize {
1416        let mut result = 0;
1417        let mut path_offset = 0;
1418        let mut ref_offset = 0;
1419        let mut path_base = 0;
1420        let mut ref_base = 0;
1421        while path_offset < path.len() && ref_offset < ref_path.len() {
1422            let a = self.records.get(&path[path_offset]).unwrap().sequence();
1423            let b = self.records.get(&ref_path[ref_offset]).unwrap().sequence();
1424            while path_base < a.len() && ref_base < b.len() {
1425                if a[path_base] != b[ref_base] {
1426                    return result;
1427                }
1428                path_base += 1;
1429                ref_base += 1;
1430                result += 1;
1431            }
1432            if path_base == a.len() {
1433                path_offset += 1;
1434                path_base = 0;
1435            }
1436            if ref_base == b.len() {
1437                ref_offset += 1;
1438                ref_base = 0;
1439            }
1440        }
1441        result
1442    }
1443
1444    // Returns the number of matching bases at the end of the two paths.
1445    fn suffix_matches(&self, path: &[usize], ref_path: &[usize]) -> usize {
1446        let mut result = 0;
1447        let mut path_offset = 0;
1448        let mut ref_offset = 0;
1449        let mut path_base = 0;
1450        let mut ref_base = 0;
1451        while path_offset < path.len() && ref_offset < ref_path.len() {
1452            let a = self.records.get(&path[path.len() - path_offset - 1]).unwrap().sequence();
1453            let b = self.records.get(&ref_path[ref_path.len() - ref_offset - 1]).unwrap().sequence();
1454            while path_base < a.len() && ref_base < b.len() {
1455                if a[a.len() - path_base - 1] != b[b.len() - ref_base - 1] {
1456                    return result;
1457                }
1458                path_base += 1;
1459                ref_base += 1;
1460                result += 1;
1461            }
1462            if path_base == a.len() {
1463                path_offset += 1;
1464                path_base = 0;
1465            }
1466            if ref_base == b.len() {
1467                ref_offset += 1;
1468                ref_base = 0;
1469            }
1470        }
1471        result
1472    }
1473
1474    // Returns the penalty for a mismatch of the given length.
1475    fn mismatch_penalty(len: usize) -> usize {
1476        4 * len
1477    }
1478
1479    // Returns the penalty for a gap of the given length.
1480    fn gap_penalty(len: usize) -> usize {
1481        if len == 0 {
1482            0
1483        } else {
1484            6 + (len - 1)
1485        }
1486    }
1487
1488    // Appends edits corresponding to the alignment of the given (sub)paths.
1489    // The paths are assumed to be diverging, but there may be base-level matches at the start/end.
1490    // The middle part is either mismatch + gap or insertion + deletion, with gap length possibly 0.
1491    // Alignment scoring follows the parameters used in vg:
1492    // +1 for a match, -4 for a mismatch, -6 for gap open, -1 for gap extend.
1493    //
1494    // NOTE: It is possible that some of the mismatches are actually matches.
1495    // But we ignore this possibility, as the paths are assumed to be diverging.
1496    fn align(&self, path: &[usize], ref_path: &[usize], edits: &mut Vec<(EditOperation, usize)>) {
1497        let path_len = self.path_len(path);
1498        let ref_len = self.path_len(ref_path);
1499        let prefix = self.prefix_matches(path, ref_path);
1500        let mut suffix = self.suffix_matches(path, ref_path);
1501        if prefix + suffix > path_len {
1502            suffix = path_len - prefix;
1503        }
1504        if prefix + suffix > ref_len {
1505            suffix = ref_len - prefix;
1506        }
1507
1508
1509        Self::append_edit(edits, EditOperation::Match, prefix);
1510        let path_middle = path_len - prefix - suffix;
1511        let ref_middle = ref_len - prefix - suffix;
1512        if path_middle == 0 {
1513            Self::append_edit(edits, EditOperation::Deletion, ref_middle);
1514        } else if ref_middle == 0 {
1515            Self::append_edit(edits, EditOperation::Insertion, path_middle);
1516        } else {
1517            let mismatch = cmp::min(path_middle, ref_middle);
1518            let mismatch_indel =
1519                Self::mismatch_penalty(mismatch) +
1520                Self::gap_penalty(path_middle - mismatch) +
1521                Self::gap_penalty(ref_middle - mismatch);
1522            let insertion_deletion = Self::gap_penalty(path_middle) + Self::gap_penalty(ref_middle);
1523            if mismatch_indel <= insertion_deletion {
1524                Self::append_edit(edits, EditOperation::Match, mismatch);
1525                Self::append_edit(edits, EditOperation::Insertion, path_middle - mismatch);
1526                Self::append_edit(edits, EditOperation::Deletion, ref_middle - mismatch);
1527            } else {
1528                Self::append_edit(edits, EditOperation::Insertion, path_middle);
1529                Self::append_edit(edits, EditOperation::Deletion, ref_middle);
1530            }
1531        }
1532        Self::append_edit(edits, EditOperation::Match, suffix);
1533    }
1534
1535    // Returns the CIGAR string for the given path, aligned to the reference path.
1536    // Takes the alignment from the LCS of the paths weighted by node lengths.
1537    // Diverging parts are aligned using `align()`.
1538    fn align_to_ref(&self, path_id: usize) -> Option<String> {
1539        let ref_id = self.ref_id?;
1540        if path_id == ref_id || path_id >= self.paths.len() {
1541            return None;
1542        }
1543
1544        // Find the LCS of the paths weighted by node lengths.
1545        let weight = &|handle: usize| -> usize {
1546            self.records.get(&handle).unwrap().sequence_len()
1547        };
1548        let path = &self.paths[path_id].path;
1549        let ref_path = &self.paths[ref_id].path;
1550        let (lcs, _) = algorithms::fast_weighted_lcs(path, ref_path, weight);
1551
1552        // Convert the LCS to a sequence of edit operations
1553        let mut edits: Vec<(EditOperation, usize)> = Vec::new();
1554        let mut path_offset = 0;
1555        let mut ref_offset = 0;
1556        for (next_path_offset, next_ref_offset) in lcs.iter() {
1557            let path_interval = &path[path_offset..*next_path_offset];
1558            let ref_interval = &ref_path[ref_offset..*next_ref_offset];
1559            self.align(path_interval, ref_interval, &mut edits);
1560            let node_len = self.records.get(&path[*next_path_offset]).unwrap().sequence_len();
1561            Self::append_edit(&mut edits, EditOperation::Match, node_len);
1562            path_offset = next_path_offset + 1;
1563            ref_offset = next_ref_offset + 1;
1564        }
1565        self.align(&path[path_offset..], &ref_path[ref_offset..], &mut edits);
1566
1567        // Convert the edits to a CIGAR string.
1568        let mut result = String::new();
1569        for (op, len) in edits.iter() {
1570            result.push_str(&format!("{}{}", len, op));
1571        }
1572        Some(result)
1573    }
1574}
1575
1576//-----------------------------------------------------------------------------
1577
1578/// Output formats.
1579impl Subgraph {
1580    /// Writes the subgraph in the GFA format to the given output.
1581    ///
1582    /// The output is a full GFA file, including the header.
1583    /// If `cigar` is true, the CIGAR strings for the non-reference haplotypes are included in the output.
1584    pub fn write_gfa<T: Write>(&self, output: &mut T, cigar: bool) -> io::Result<()> {
1585        // Header.
1586        let reference_samples = self.ref_path.as_ref().map(|path| path.sample.as_ref());
1587        formats::write_gfa_header(reference_samples, output)?;
1588        if let Some(graph_name) = self.graph_name.as_ref() {
1589            let header_lines = graph_name.to_gfa_header_lines();
1590            formats::write_header_lines(&header_lines, output)?;
1591        }
1592
1593        // Segments.
1594        for (handle, record) in self.records.iter() {
1595            if support::node_orientation(*handle) == Orientation::Forward {
1596                formats::write_gfa_node(record.id(), record.sequence(), output)?;
1597            }
1598        }
1599
1600        // Links.
1601        for (handle, record) in self.records.iter() {
1602            let from = support::decode_node(*handle);
1603            for successor in record.successors() {
1604                let to = support::decode_node(successor);
1605                if self.has_handle(successor) && support::edge_is_canonical(from, to) {
1606                    formats::write_gfa_link(
1607                        (from.0.to_string().as_bytes(), from.1),
1608                        (to.0.to_string().as_bytes(), to.1),
1609                        output
1610                    )?;
1611                }
1612            }
1613        }
1614
1615        // Paths.
1616        if let Some((metadata, ref_id)) = self.ref_metadata() {
1617            formats::write_gfa_walk(&self.paths[ref_id].path, &metadata, output)?;
1618        }
1619        let mut haplotype = 1;
1620        let contig_name = self.contig_name();
1621        for (id, path_info) in self.paths.iter().enumerate() {
1622            if Some(id) == self.ref_id {
1623                continue;
1624            }
1625            let mut metadata = WalkMetadata::anonymous(haplotype, &contig_name, path_info.len);
1626            metadata.add_weight(path_info.weight);
1627            if cigar {
1628                metadata.add_cigar(self.align_to_ref(id));
1629            }
1630            formats::write_gfa_walk(&path_info.path, &metadata, output)?;
1631            haplotype += 1;
1632        }
1633
1634        Ok(())
1635    }
1636
1637    // TODO: We cannot include graph name, as there is no header information.
1638    /// Writes the subgraph in the JSON format to the given output.
1639    ///
1640    /// If `cigar` is true, the CIGAR strings for the non-reference haplotypes are included in the output.
1641    pub fn write_json<T: Write>(&self, output: &mut T, cigar: bool) -> io::Result<()> {
1642        // Nodes.
1643        let mut nodes: Vec<JSONValue> = Vec::new();
1644        for (_, record) in self.records.iter() {
1645            let (id, orientation) = support::decode_node(record.handle());
1646            if orientation == Orientation::Reverse {
1647                continue;
1648            }
1649            let node = JSONValue::Object(vec![
1650                ("id".to_string(), JSONValue::String(id.to_string())),
1651                ("sequence".to_string(), JSONValue::String(String::from_utf8_lossy(record.sequence()).to_string())),
1652            ]);
1653            nodes.push(node);
1654        }
1655
1656        // Edges.
1657        let mut edges: Vec<JSONValue> = Vec::new();
1658        for (handle, record) in self.records.iter() {
1659            let from = support::decode_node(*handle);
1660            for successor in record.successors() {
1661                let to = support::decode_node(successor);
1662                if self.has_handle(successor) && support::edge_is_canonical(from, to) {
1663                    let edge = JSONValue::Object(vec![
1664                        ("from".to_string(), JSONValue::String(from.0.to_string())),
1665                        ("from_is_reverse".to_string(), JSONValue::Boolean(from.1 == Orientation::Reverse)),
1666                        ("to".to_string(), JSONValue::String(to.0.to_string())),
1667                        ("to_is_reverse".to_string(), JSONValue::Boolean(to.1 == Orientation::Reverse)),
1668                    ]);
1669                    edges.push(edge);
1670                }
1671            }
1672        }
1673
1674        // Paths.
1675        let mut paths: Vec<JSONValue> = Vec::new();
1676        if let Some((metadata, ref_id)) = self.ref_metadata() {
1677            let ref_path = formats::json_path(&self.paths[ref_id].path, &metadata);
1678            paths.push(ref_path);
1679        }
1680        let mut haplotype = 1;
1681        let contig_name = self.contig_name();
1682        for (id, path_info) in self.paths.iter().enumerate() {
1683            if Some(id) == self.ref_id {
1684                continue;
1685            }
1686            let mut metadata = WalkMetadata::anonymous(haplotype, &contig_name, path_info.len);
1687            metadata.add_weight(path_info.weight);
1688            if cigar {
1689                metadata.add_cigar(self.align_to_ref(id));
1690            }
1691            let path = formats::json_path(&path_info.path, &metadata);
1692            paths.push(path);
1693            haplotype += 1;
1694        }
1695
1696        let result = JSONValue::Object(vec![
1697            ("nodes".to_string(), JSONValue::Array(nodes)),
1698            ("edges".to_string(), JSONValue::Array(edges)),
1699            ("paths".to_string(), JSONValue::Array(paths)),
1700        ]);
1701        output.write_all(result.to_string().as_bytes())?;
1702
1703        Ok(())
1704    }
1705
1706    // Builds metadata for the reference path.
1707    fn ref_metadata(&self) -> Option<(WalkMetadata, usize)> {
1708        let ref_id = self.ref_id?;
1709        let ref_path = self.ref_path.as_ref()?;
1710        let interval = self.ref_interval.as_ref()?.clone();
1711        let mut metadata = WalkMetadata::path_interval(ref_path, interval);
1712        metadata.add_weight(self.paths[ref_id].weight);
1713        Some((metadata, ref_id))
1714    }
1715
1716    // Determines a contig name for the subgraph.
1717    fn contig_name(&self) -> String {
1718        if let Some(ref_path) = self.ref_path.as_ref() {
1719            ref_path.contig.clone()
1720        } else {
1721            String::from("unknown")
1722        }
1723    }
1724}
1725
1726//-----------------------------------------------------------------------------
1727
1728/// Graph names.
1729impl Subgraph {
1730    /// Returns the stable graph name (pggname) for the subgraph.
1731    pub fn graph_name(&self) -> Option<&GraphName> {
1732        self.graph_name.as_ref()
1733    }
1734
1735    /// Returns `true` if the subgraph has a graph name.
1736    pub fn has_graph_name(&self) -> bool {
1737        self.graph_name.is_some()
1738    }
1739
1740    /// Computes and stores the stable graph name (pggname) for the subgraph.
1741    ///
1742    /// If the name of the parent graph is given, this graph will be marked as its subgraph.
1743    /// All other graph relationships are also copied from the parent.
1744    /// No effect if the subgraph already has a graph name.
1745    pub fn compute_name(&mut self, parent: Option<&GraphName>) {
1746        if self.has_graph_name() {
1747            return;
1748        }
1749        let name = pggname::stable_name(self);
1750        let mut result = GraphName::new(name);
1751        if let Some(parent) = parent {
1752            result.make_subgraph_of(parent);
1753        }
1754        self.graph_name = Some(result);
1755    }
1756}
1757
1758impl Graph for Subgraph {
1759    fn new() -> Self {
1760        unimplemented!();
1761    }
1762
1763    fn add_node(&mut self, _: &[u8], _: &[u8]) -> Result<(), String> {
1764        unimplemented!();
1765    }
1766
1767    fn add_edge(&mut self, _: &[u8], _: Orientation, _: &[u8], _: Orientation) -> Result<(), String> {
1768        unimplemented!();
1769    }
1770
1771    fn finalize(&mut self) -> Result<(), String> {
1772        Ok(())
1773    }
1774
1775    fn statistics(&self) -> (usize, usize, usize) {
1776        let node_count = self.nodes();
1777        let mut edge_count = 0;
1778        let mut seq_len = 0;
1779        for (handle, record) in self.records.iter() {
1780            let (from_id, from_o) = support::decode_node(*handle);
1781            if from_o == Orientation::Forward {
1782                seq_len += record.sequence_len();
1783            }
1784            for successor in record.successors() {
1785                let (to_id, to_o) = support::decode_node(successor);
1786                if self.has_node(to_id) && support::edge_is_canonical((from_id, from_o), (to_id, to_o)) {
1787                    edge_count += 1;
1788                }
1789            }
1790        }
1791        (node_count, edge_count, seq_len)
1792    }
1793
1794    fn node_iter(&self) -> impl Iterator<Item=Vec<u8>> {
1795        self.node_iter().map(|from_id| {
1796            let sequence = self.sequence(from_id).unwrap().to_vec();
1797            let mut node = NodeInt::new(Some(sequence));
1798            for from_o in [Orientation::Forward, Orientation::Reverse] {
1799                for (to_id, to_o) in self.successors(from_id, from_o).unwrap() {
1800                    if support::edge_is_canonical((from_id, from_o), (to_id, to_o)) {
1801                        node.edges.push((from_o, to_id, to_o));
1802                    }
1803                }
1804            }
1805            node.finalize();
1806            node.serialize(from_id)
1807        })
1808    }
1809}
1810
1811//-----------------------------------------------------------------------------
1812
1813/// A path position represented in multiple ways.
1814#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1815pub struct PathPosition {
1816    // Sequence offset from the start of the path.
1817    seq_offset: usize,
1818    // GBWT node identifier encoding an oriented node.
1819    gbwt_node: usize,
1820    // Offset from the start of the node.
1821    node_offset: usize,
1822    // Offset within GBWT record.
1823    gbwt_offset: usize,
1824}
1825
1826impl PathPosition {
1827    /// Returns the sequence offset in bp from the start of the path.
1828    #[inline]
1829    pub fn seq_offset(&self) -> usize {
1830        self.seq_offset
1831    }
1832
1833    /// Returns the node identifier.
1834    #[inline]
1835    pub fn node_id(&self) -> usize {
1836        support::node_id(self.gbwt_node)
1837    }
1838
1839    /// Returns the orientation of the node.
1840    #[inline]
1841    pub fn orientation(&self) -> Orientation {
1842        support::node_orientation(self.gbwt_node)
1843    }
1844
1845    /// Returns the offset from the start of the node.
1846    #[inline]
1847    pub fn node_offset(&self) -> usize {
1848        self.node_offset
1849    }
1850
1851    /// Returns the graph position.
1852    #[inline]
1853    pub fn graph_pos(&self) -> GraphPosition {
1854        GraphPosition {
1855            node: self.node_id(),
1856            orientation: self.orientation(),
1857            offset: self.node_offset(),
1858        }
1859    }
1860
1861    /// Returns the GBWT node identifier / handle.
1862    #[inline]
1863    pub fn handle(&self) -> usize {
1864        self.gbwt_node
1865    }
1866
1867    /// Returns the offset within the GBWT record.
1868    #[inline]
1869    pub fn gbwt_offset(&self) -> usize {
1870        self.gbwt_offset
1871    }
1872
1873    /// Returns the GBWT position for the oriented node containing the position.
1874    #[inline]
1875    pub fn gbwt_pos(&self) -> Pos {
1876        Pos {
1877            node: self.handle(),
1878            offset: self.gbwt_offset(),
1879        }
1880    }
1881}
1882
1883// TODO: Add hash of the path for faster comparisons?
1884#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
1885struct PathInfo {
1886    path: Vec<usize>,
1887    len: usize,
1888    weight: Option<usize>,
1889}
1890
1891impl PathInfo {
1892    fn new(path: Vec<usize>, len: usize) -> Self {
1893        PathInfo { path, len, weight: None }
1894    }
1895
1896    fn weighted(path: Vec<usize>, len: usize) -> Self {
1897        PathInfo { path, len, weight: Some(1) }
1898    }
1899
1900    fn increment_weight(&mut self) {
1901        if let Some(weight) = self.weight {
1902            self.weight = Some(weight + 1);
1903        }
1904    }
1905
1906    // Returns the sequence interval (in bp) for this path, assuming that `path[path_offset]` matches `path_pos`.
1907    // The sequence interval is for the part of the path contained in the subgraph.
1908    fn path_interval(&self, subgraph: &Subgraph, path_offset: usize, path_pos: &PathPosition) -> Range<usize> {
1909        // Distance from the start of the path to `path_pos`.
1910        let mut ref_pos = path_pos.node_offset();
1911        for handle in self.path.iter().take(path_offset) {
1912            ref_pos += subgraph.records.get(handle).unwrap().sequence_len();
1913        }
1914
1915        let start = path_pos.seq_offset() - ref_pos;
1916        let end = start + self.len;
1917        start..end
1918    }
1919}
1920
1921#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1922enum EditOperation {
1923    Match,
1924    Insertion,
1925    Deletion,
1926}
1927
1928impl Display for EditOperation {
1929    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1930        match self {
1931            EditOperation::Match => write!(f, "M"),
1932            EditOperation::Insertion => write!(f, "I"),
1933            EditOperation::Deletion => write!(f, "D"),
1934        }
1935    }
1936}
1937
1938//-----------------------------------------------------------------------------
1939
1940/// An iterator over the predecessors or successors of a node.
1941///
1942/// The type of `Item` is `(`[`usize`]`, `[`Orientation`]`)`.
1943/// These values encode the identifier and the orientation of the node.
1944/// Successor nodes are always listed in sorted order.
1945/// Predecessor nodes are sorted by their identifiers, but the reverse orientation is listed before the forward orientation.
1946/// Like [`gbz::gbz::EdgeIter`], but only for edges in the subgraph.
1947#[derive(Clone, Debug)]
1948pub struct EdgeIter<'a> {
1949    parent: &'a Subgraph,
1950    // Successors of the record, excluding the possible endmarker.
1951    successors: &'a [Pos],
1952    // The first outrank that has not been visited.
1953    next: usize,
1954    // The first outrank that we should not visit.
1955    limit: usize,
1956    // Flip the orientation in the iterated values.
1957    flip: bool,
1958    // Iterate over the edges in the supergraph, not the subgraph.
1959    supergraph: bool,
1960}
1961
1962
1963impl<'a> EdgeIter<'a> {
1964    /// Creates a new iterator over the successors of the record.
1965    ///
1966    /// If `flip` is `true`, the iterator will flip the orientation of the successors.
1967    /// This is effectively the same as listing the predecessors of the reverse orientation of the record.
1968    pub fn new(parent: &'a Subgraph, record: &'a GBZRecord, flip: bool) -> Self {
1969        let successors = record.edges_slice();
1970        let limit = successors.len();
1971        EdgeIter {
1972            parent,
1973            successors,
1974            next: 0,
1975            limit,
1976            flip,
1977            supergraph: false,
1978        }
1979    }
1980
1981    /// Creates a new iterator over the successors of the record in the supergraph.
1982    ///
1983    /// This is otherwise the same as [`Self::new`], but this also lists successors not in the subgraph.
1984    pub fn in_supergraph(parent: &'a Subgraph, record: &'a GBZRecord, flip: bool) -> Self {
1985        let successors = record.edges_slice();
1986        let limit = successors.len();
1987        EdgeIter {
1988            parent,
1989            successors,
1990            next: 0,
1991            limit,
1992            flip,
1993            supergraph: true,
1994        }
1995    }
1996}
1997
1998impl<'a> Iterator for EdgeIter<'a> {
1999    type Item = (usize, Orientation);
2000
2001    fn next(&mut self) -> Option<Self::Item> {
2002        while self.next < self.limit {
2003            let handle = self.successors[self.next].node;
2004            self.next += 1;
2005            if self.supergraph || self.parent.has_handle(handle) {
2006                let node_id = support::node_id(handle);
2007                let orientation = if self.flip {
2008                    support::node_orientation(handle).flip()
2009                } else {
2010                    support::node_orientation(handle)
2011                };
2012                return Some((node_id, orientation));
2013            }
2014        }
2015        None
2016    }
2017}
2018
2019impl<'a> DoubleEndedIterator for EdgeIter<'a> {
2020    fn next_back(&mut self) -> Option<Self::Item> {
2021        while self.next < self.limit {
2022            let handle = self.successors[self.limit - 1].node;
2023            self.limit -= 1;
2024            if self.supergraph || self.parent.has_handle(handle) {
2025                let node_id = support::node_id(handle);
2026                let orientation = if self.flip {
2027                    support::node_orientation(handle).flip()
2028                } else {
2029                    support::node_orientation(handle)
2030                };
2031                return Some((node_id, orientation));
2032            }
2033        }
2034        None
2035    }
2036}
2037
2038impl<'a> FusedIterator for EdgeIter<'a> {}
2039
2040//-----------------------------------------------------------------------------