gbz_base/subgraph.rs
1//! A subgraph in a GBZ graph.
2//!
3//! This module provides functionality for extracting a subgraph around a specific position or interval of a specific path.
4//! The subgraph contains all nodes within a given context and all edges between them.
5//! It can also be extended to include all nodes in top-level snarls with boundary nodes already in the subgraph.
6//! All other paths within the subgraph can also be extracted, but they will not have any true metadata associated with them.
7
8use crate::{GBZRecord, GBZPath};
9use crate::{GraphInterface, GraphReference};
10use crate::PathIndex;
11use crate::{SubgraphQuery, HaplotypeOutput};
12use crate::subgraph::query::QueryType;
13use crate::formats::{self, WalkMetadata, JSONValue};
14
15use std::cmp::Reverse;
16use std::collections::{BinaryHeap, BTreeMap, BTreeSet, HashSet};
17use std::fmt::Display;
18use std::io::{self, Write};
19use std::iter::FusedIterator;
20use std::ops::Range;
21use std::cmp;
22
23use gbz::ENDMARKER;
24use gbz::{GBZ, GraphPosition, Orientation, NodeSide, Pos, FullPathName};
25use gbz::support::Chains;
26use gbz::{algorithms, support};
27
28use pggname::{Graph, GraphName};
29use pggname::graph::NodeInt;
30
31#[cfg(test)]
32mod tests;
33
34pub mod query;
35
36//-----------------------------------------------------------------------------
37
38/// A subgraph induced by a set of node identifiers.
39///
40/// The subgraph contains the specified nodes and the edges between them.
41/// Paths in the current subgraph can be extracted using [`Subgraph::extract_paths`].
42/// Only the provided reference path will have proper metadata.
43/// Other paths remain anonymous, as we cannot identify them efficiently using the GBWT.
44/// Any changes to the subgraph will remove the extracted paths.
45/// When a subgraph is changed, the operations will try to reuse already extracted node records.
46/// This makes it viable as a sliding window over a reference path.
47///
48/// Node identifiers can be selected using:
49/// * [`Subgraph::add_node_from_gbz`], [`Subgraph::add_node_from_db`], and [`Subgraph::remove_node`] with individual ids.
50/// * [`Subgraph::around_position`] for a context around a graph position.
51/// * [`Subgraph::around_interval`] for a context around path interval.
52/// * [`Subgraph::around_nodes`] for a context around a set of nodes.
53/// * [`Subgraph::between_nodes`] for all nodes and snarls in an interval of a chain.
54/// * [`Subgraph::extract_snarls`] to include all nodes in top-level snarls with boundary nodes in the current subgraph.
55///
56/// [`Subgraph::from_gbz`] and [`Subgraph::from_db`] are integrated methods for extracting a subgraph with paths using a [`SubgraphQuery`].
57///
58/// `Subgraph` implements a similar graph interface to the node/edge operations of [`GBZ`].
59/// It can also be serialized in GFA and JSON formats using [`Subgraph::write_gfa`] and [`Subgraph::write_json`].
60///
61/// [`Subgraph`] also implements [`Graph`] from the [`pggname`].
62/// That enables computing stable graph names using [`pggname::stable_name`].
63///
64/// # Examples
65///
66/// The following example replicates an offset-based [`Subgraph::from_gbz`] query using lower-level functions.
67/// It also demonstrates some parts of the graph interface in the subgraph object.
68///
69/// ```
70/// use gbz_base::{GBZRecord, GraphReference, PathIndex, Subgraph, HaplotypeOutput};
71/// use gbz::{GBZ, FullPathName, Orientation};
72/// use gbz::support;
73/// use simple_sds::serialize;
74///
75/// // Get the graph.
76/// let gbz_file = support::get_test_data("example.gbz");
77/// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
78///
79/// // Create a path index with 3 bp intervals.
80/// let path_index = PathIndex::new(&graph, 3, false).unwrap();
81///
82/// // Find the position for path A offset 2.
83/// let mut query_pos = FullPathName::generic("A");
84/// query_pos.fragment = 2;
85/// let mut subgraph = Subgraph::new();
86/// let result = subgraph.path_pos_from_gbz(&graph, &path_index, &query_pos);
87/// assert!(result.is_ok());
88/// let (path_pos, path_name) = result.unwrap();
89///
90/// // Extract a 1 bp context around the position.
91/// let result = subgraph.around_position(GraphReference::Gbz(&graph), path_pos.graph_pos(), 1);
92/// assert!(result.is_ok());
93///
94/// // Extract all paths in the subgraph.
95/// let result = subgraph.extract_paths(Some((path_pos, path_name)), HaplotypeOutput::All);
96/// assert!(result.is_ok());
97///
98/// // Compute the stable name of the subgraph with no parent graph.
99/// subgraph.compute_name(None);
100/// assert!(subgraph.has_graph_name());
101///
102/// // The subgraph should be centered around 1 bp node 14 of degree 4.
103/// let nodes = [12, 13, 14, 15, 16];
104/// assert_eq!(subgraph.nodes(), nodes.len());
105/// assert!(subgraph.node_iter().eq(nodes.iter().copied()));
106/// assert_eq!(subgraph.paths(), 3);
107///
108/// // The result is a subgraph induced by the nodes.
109/// for &node_id in nodes.iter() {
110/// assert!(subgraph.has_node(node_id));
111/// assert_eq!(subgraph.sequence(node_id), graph.sequence(node_id));
112/// for orientation in [Orientation::Forward, Orientation::Reverse] {
113/// // Successors are present if they are in the subgraph.
114/// let sub_succ = subgraph.successors(node_id, orientation).unwrap();
115/// let graph_succ = graph.successors(node_id, orientation).unwrap();
116/// assert!(sub_succ.eq(graph_succ.filter(|(id, _)| nodes.contains(id))));
117/// // And the same applies to predecessors.
118/// let sub_pred = subgraph.predecessors(node_id, orientation).unwrap();
119/// let graph_pred = graph.predecessors(node_id, orientation).unwrap();
120/// assert!(sub_pred.eq(graph_pred.filter(|(id, _)| nodes.contains(id))));
121/// }
122/// }
123/// ```
124#[derive(Clone, Debug, Default, PartialEq, Eq)]
125pub struct Subgraph {
126 // Node records for the subgraph.
127 records: BTreeMap<usize, GBZRecord>,
128
129 // Paths in the subgraph.
130 paths: Vec<PathInfo>,
131
132 // Offset in `paths` for the reference path, if any.
133 ref_id: Option<usize>,
134
135 // Metadata for the reference path, if any.
136 ref_path: Option<FullPathName>,
137
138 // Interval of the reference path that is present in the subgraph, if any.
139 ref_interval: Option<Range<usize>>,
140
141 // Stable graph name.
142 graph_name: Option<GraphName>,
143}
144
145//-----------------------------------------------------------------------------
146
147/// Construction.
148impl Subgraph {
149 /// Creates a new empty subgraph.
150 pub fn new() -> Self {
151 Subgraph::default()
152 }
153
154 /// Returns the path position for the haplotype offset represented by the query position.
155 ///
156 /// `query_pos.fragment` is used as an offset in the haplotype.
157 /// The return value consists of the position and metadata for the path covering the position.
158 /// Updates the subgraph to include the path from the nearest indexed position to the query position.
159 ///
160 /// # Arguments
161 ///
162 /// * `graph`: A GBZ graph.
163 /// * `path_index`: A path index for the graph.
164 /// * `query_pos`: Query position.
165 ///
166 /// # Errors
167 ///
168 /// Returns an error if there is no path covering the given position or the path has not been indexed for random access.
169 ///
170 /// # Examples
171 ///
172 /// ```
173 /// use gbz_base::{GBZRecord, Subgraph, GraphReference, PathIndex};
174 /// use gbz::{GBZ, FullPathName, Orientation};
175 /// use gbz::support;
176 /// use simple_sds::serialize;
177 ///
178 /// // Get the graph.
179 /// let gbz_file = support::get_test_data("example.gbz");
180 /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
181 ///
182 /// // Create a path index with 3 bp intervals.
183 /// let path_index = PathIndex::new(&graph, 3, false).unwrap();
184 ///
185 /// // Query for path A offset 2.
186 /// let mut query_pos = FullPathName::generic("A");
187 /// query_pos.fragment = 2;
188 /// let mut subgraph = Subgraph::new();
189 /// let result = subgraph.path_pos_from_gbz(&graph, &path_index, &query_pos);
190 /// assert!(result.is_ok());
191 /// let (path_pos, path_name) = result.unwrap();
192 /// assert_eq!(path_pos.seq_offset(), query_pos.fragment);
193 ///
194 /// // The query position is at the start of node 14 in forward orientation.
195 /// assert_eq!(path_pos.node_id(), 14);
196 /// assert_eq!(path_pos.orientation(), Orientation::Forward);
197 /// assert_eq!(path_pos.node_offset(), 0);
198 ///
199 /// // And this happens to be the first path visit to the node.
200 /// let gbwt_node = support::encode_node(14, Orientation::Forward);
201 /// assert_eq!(path_pos.handle(), gbwt_node);
202 /// assert_eq!(path_pos.gbwt_offset(), 0);
203 ///
204 /// // And it is covered by the path fragment starting from offset 0.
205 /// assert_eq!(path_name, FullPathName::generic("A"));
206 ///
207 /// // Now extract 1 bp context around interval 2..4.
208 /// let result = subgraph.around_interval(GraphReference::Gbz(&graph), path_pos, 2, 1);
209 /// assert!(result.is_ok());
210 ///
211 /// // The interval corresponds to nodes 14 and 15.
212 /// let true_nodes = vec![12, 13, 14, 15, 16, 17];
213 /// assert_eq!(subgraph.nodes(), true_nodes.len());
214 /// assert!(subgraph.node_iter().eq(true_nodes.iter().copied()));
215 /// ```
216 pub fn path_pos_from_gbz(
217 &mut self,
218 graph: &GBZ,
219 path_index: &PathIndex,
220 query_pos: &FullPathName
221 ) -> Result<(PathPosition, FullPathName), String> {
222 let path = GBZPath::with_name(graph, query_pos).ok_or_else(||
223 format!("Cannot find a path covering {}", query_pos)
224 )?;
225 // Transform the offset relative to the haplotype to the offset relative to the path.
226 let query_offset = query_pos.fragment - path.name.fragment;
227
228 // Path id to an indexed position.
229 let index_offset = path_index.path_to_offset(path.handle).ok_or_else(||
230 format!("Path {} has not been indexed for random access", path.name())
231 )?;
232 let (path_offset, pos) = path_index.indexed_position(index_offset, query_offset).unwrap();
233
234 self.find_path_pos(GraphReference::Gbz(graph), query_offset, path_offset, pos, path.name)
235 }
236
237 /// Returns the path position for the haplotype offset represented by the query position.
238 ///
239 /// `query_pos.fragment` is used as an offset in the haplotype.
240 /// The return value consists of the position and metadata for the path covering the position.
241 /// Updates the subgraph to include the path from the nearest indexed position to the query position.
242 ///
243 /// # Arguments
244 ///
245 /// * `graph`: A graph interface.
246 /// * `query_pos`: Query position.
247 ///
248 /// # Errors
249 ///
250 /// Returns an error if database operations fail.
251 /// Returns an error if there is no path covering the given position or the path has not been indexed for random access.
252 ///
253 /// # Examples
254 ///
255 /// ```
256 /// use gbz_base::{GBZBase, GraphInterface, Subgraph};
257 /// use gbz::{FullPathName, Orientation};
258 /// use gbz::support;
259 /// use simple_sds::serialize;
260 /// use std::fs;
261 ///
262 /// // Create the database.
263 /// let gbz_file = support::get_test_data("example.gbz");
264 /// let db_file = serialize::temp_file_name("subgraph");
265 /// let result = GBZBase::create_from_files(&gbz_file, None, &db_file);
266 /// assert!(result.is_ok());
267 ///
268 /// // Open the database and create a graph interface.
269 /// let database = GBZBase::open(&db_file).unwrap();
270 /// let mut interface = GraphInterface::new(&database).unwrap();
271 ///
272 /// // Query for path A offset 2.
273 /// let mut query_pos = FullPathName::generic("A");
274 /// query_pos.fragment = 2;
275 /// let mut subgraph = Subgraph::new();
276 /// let result = subgraph.path_pos_from_db(&mut interface, &query_pos);
277 /// assert!(result.is_ok());
278 /// let (path_pos, path_name) = result.unwrap();
279 /// assert_eq!(path_pos.seq_offset(), query_pos.fragment);
280 ///
281 /// // The query position is at the start of node 14 in forward orientation.
282 /// assert_eq!(path_pos.node_id(), 14);
283 /// assert_eq!(path_pos.orientation(), Orientation::Forward);
284 /// assert_eq!(path_pos.node_offset(), 0);
285 ///
286 /// // And this happens to be the first path visit to the node.
287 /// let gbwt_node = support::encode_node(14, Orientation::Forward);
288 /// assert_eq!(path_pos.handle(), gbwt_node);
289 /// assert_eq!(path_pos.gbwt_offset(), 0);
290 ///
291 /// // And it is covered by the path fragment starting from offset 0.
292 /// assert_eq!(path_name, FullPathName::generic("A"));
293 ///
294 /// // Clean up.
295 /// drop(interface);
296 /// drop(database);
297 /// fs::remove_file(&db_file).unwrap();
298 /// ```
299 pub fn path_pos_from_db<'reference, 'graph>(
300 &mut self,
301 graph: &'reference mut GraphInterface<'graph>,
302 query_pos: &FullPathName
303 ) -> Result<(PathPosition, FullPathName), String> {
304 let path = graph.find_path(query_pos)?;
305 let path = path.ok_or_else(|| format!("Cannot find a path covering {}", query_pos))?;
306 if !path.is_indexed {
307 return Err(format!("Path {} has not been indexed for random access", query_pos));
308 }
309 // Transform the offset relative to the haplotype to the offset relative to the path.
310 let query_offset = query_pos.fragment - path.name.fragment;
311
312 // Find an indexed position before the query position.
313 let result = graph.indexed_position(path.handle, query_offset)?;
314 let (path_offset, pos) = result.ok_or_else(||
315 format!("Path {} has not been indexed for random access", path.name())
316 )?;
317
318 self.find_path_pos(GraphReference::Db(graph), query_offset, path_offset, pos, path.name)
319 }
320
321 // Shared functionality for `path_pos_from_gbz` and `path_pos_from_db`.
322 fn find_path_pos(
323 &mut self,
324 graph: GraphReference<'_, '_>,
325 query_offset: usize,
326 path_offset: usize,
327 pos: Pos,
328 path_name: FullPathName
329 ) -> Result<(PathPosition, FullPathName), String> {
330 // Iterate over the path until the query position, updating the subgraph.
331 let mut path_offset = path_offset;
332 let mut pos = pos;
333 let mut node_offset: Option<usize> = None;
334 let mut gbwt_pos: Option<Pos> = None;
335 let mut graph = graph;
336 loop {
337 let node_id = support::node_id(pos.node);
338 if !self.has_node(node_id) {
339 self.add_node_internal(&mut graph, node_id)?;
340 }
341 let record = self.record(pos.node).unwrap();
342 if path_offset + record.sequence_len() > query_offset {
343 node_offset = Some(query_offset - path_offset);
344 gbwt_pos = Some(pos);
345 break;
346 }
347 path_offset += record.sequence_len();
348 let next = record.to_gbwt_record().lf(pos.offset);
349 if next.is_none() {
350 break;
351 }
352 pos = next.unwrap();
353 }
354
355 let node_offset = node_offset.ok_or_else(||
356 format!("Path {} does not contain offset {}", path_name, query_offset)
357 )?;
358 let gbwt_pos = gbwt_pos.unwrap();
359
360 let path_position = PathPosition {
361 seq_offset: query_offset,
362 gbwt_node: gbwt_pos.node,
363 node_offset,
364 gbwt_offset: gbwt_pos.offset,
365 };
366 Ok((path_position, path_name))
367 }
368
369 /// Updates the subgraph to a context around the given graph position.
370 ///
371 /// Reuses existing records when possible.
372 /// Removes node records outside the context as well as all path information.
373 /// Returns the number of inserted and removed nodes.
374 /// See [`Subgraph`] for an example.
375 ///
376 /// # Arguments
377 ///
378 /// * `graph`: A GBZ-compatible graph.
379 /// * `pos`: The reference position for the subgraph.
380 /// * `context`: The context length around the reference position (in bp).
381 ///
382 /// # Errors
383 ///
384 /// Passes through any errors from accessing the graph.
385 pub fn around_position(
386 &mut self,
387 graph: GraphReference<'_, '_>,
388 pos: GraphPosition,
389 context: usize
390 ) -> Result<(usize, usize), String> {
391 // The initial node is always in the subgraph, so we might as well add it now to determine sequence length.
392 let mut graph = graph;
393 if !self.has_node(pos.node) {
394 self.add_node_internal(&mut graph, pos.node)?;
395 }
396 let handle = pos.to_gbwt();
397 let record = self.record(handle).unwrap();
398
399 // Start the graph traversal from both sides of the initial node.
400 let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
401 let start = (pos.node, support::entry_side(pos.orientation));
402 active.push(Reverse((pos.offset, start)));
403 let end_distance = record.sequence_len() - pos.offset - 1;
404 let end = (pos.node, support::exit_side(pos.orientation));
405 active.push(Reverse((end_distance, end)));
406
407 self.insert_context(graph, active, context)
408 }
409
410 /// Updates the subgraph to a context around the given path interval.
411 ///
412 /// Reuses existing records when possible.
413 /// Removes node records outside the context as well as all path information.
414 /// Returns the number of inserted and removed nodes.
415 /// See [`Self::path_pos_from_gbz`] for an example.
416 ///
417 /// # Arguments
418 ///
419 /// * `graph`: A GBZ-compatible graph.
420 /// * `start_pos`: The starting position for the interval.
421 /// * `len`: Length of the interval (in bp).
422 /// * `context`: The context length around the reference interval (in bp).
423 ///
424 /// # Errors
425 ///
426 /// Passes through any errors from accessing the graph.
427 /// Returns an error if the interval is empty, starts outside the initial node, or is longer than the remaining path.
428 pub fn around_interval(
429 &mut self,
430 graph: GraphReference<'_, '_>,
431 start_pos: PathPosition,
432 len: usize,
433 context: usize
434 ) -> Result<(usize, usize), String> {
435 if len == 0 {
436 return Err(String::from("Interval length must be greater than 0"));
437 }
438 let mut pos = start_pos.gbwt_pos();
439 let mut offset = start_pos.node_offset();
440 let mut len = len;
441
442 // Insert all nodes in the interval to the subgraph and to active node sides.
443 let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
444 let mut graph = graph;
445 loop {
446 let node_id = support::node_id(pos.node);
447 let orientation = support::node_orientation(pos.node);
448 if !self.has_node(node_id) {
449 self.add_node_internal(&mut graph, node_id)?;
450 }
451 let record = self.record(pos.node).unwrap();
452 if offset >= record.sequence_len() {
453 return Err(format!("Offset {} in node {} of length {}", offset, node_id, record.sequence_len()));
454 }
455
456 // Handle the current node.
457 let start = (node_id, support::entry_side(orientation));
458 active.push(Reverse((offset, start)));
459 let end = (node_id, support::exit_side(orientation));
460 let distance_to_next = record.sequence_len() - offset;
461 if len <= distance_to_next {
462 let end_distance = if len == distance_to_next { 0 } else { distance_to_next - len - 1 };
463 active.push(Reverse((end_distance, end)));
464 break;
465 } else {
466 active.push(Reverse((0, end)));
467 }
468
469 // Proceed to the next node.
470 if let Some(next) = record.to_gbwt_record().lf(pos.offset) {
471 pos = next;
472 } else {
473 return Err(format!("No successor for GBWT position ({}, {})", pos.node, pos.offset));
474 }
475 offset = 0;
476 len -= distance_to_next;
477 }
478
479 self.insert_context(graph, active, context)
480 }
481
482 /// Updates the subgraph to a context around the given nodes.
483 ///
484 /// Reuses existing records when possible.
485 /// Removes node records outside the context as well as all path information.
486 /// Returns the number of inserted and removed nodes.
487 ///
488 /// # Arguments
489 ///
490 /// * `graph`: A GBZ-compatible graph.
491 /// * `nodes`: Set of reference nodes for the subgraph.
492 /// * `context`: The context length around the reference node (in bp).
493 ///
494 /// # Errors
495 ///
496 /// Passes through any errors from accessing the graph.
497 ///
498 /// # Examples
499 ///
500 /// ```
501 /// use gbz_base::{GBZRecord, Subgraph, GraphReference};
502 /// use gbz::GBZ;
503 /// use gbz::support;
504 /// use simple_sds::serialize;
505 /// use std::collections::BTreeSet;
506 ///
507 /// // Get the graph.
508 /// let gbz_file = support::get_test_data("translation.gbz");
509 /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
510 ///
511 /// // Extract a subgraph that contains an 1 bp context around node 5.
512 /// let mut subgraph = Subgraph::new();
513 /// let mut nodes = BTreeSet::new();
514 /// nodes.insert(5);
515 /// let result = subgraph.around_nodes(GraphReference::Gbz(&graph), &nodes, 1);
516 /// assert!(result.is_ok());
517 ///
518 /// // The subgraph should be centered around 2 bp node 5 of degree 3.
519 /// let true_nodes = [3, 4, 5, 6];
520 /// assert_eq!(subgraph.nodes(), true_nodes.len());
521 /// assert!(subgraph.node_iter().eq(true_nodes.iter().copied()));
522 ///
523 /// // But there are no paths.
524 /// assert_eq!(subgraph.paths(), 0);
525 /// ```
526 pub fn around_nodes(
527 &mut self,
528 graph: GraphReference<'_, '_>,
529 nodes: &BTreeSet<usize>,
530 context: usize
531 ) -> Result<(usize, usize), String> {
532 // Start the graph traversal from both sides of the initial nodes.
533 let mut active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>> = BinaryHeap::new();
534 for &node_id in nodes {
535 let start = (node_id, NodeSide::Left);
536 active.push(Reverse((0, start)));
537 let end = (node_id, NodeSide::Right);
538 active.push(Reverse((0, end)));
539 }
540
541 self.insert_context(graph, active, context)
542 }
543
544 // Inserts all nodes within the context, starting from the active node sides.
545 //
546 // Returns the number of inserted and removed nodes.
547 // All nodes in `active` are assumed to be in the subgraph, even if the distances to the sides exceed the context.
548 // Reuses existing records if possible.
549 // Removes existing nodes that are not in the new context.
550 // Clears all path information.
551 fn insert_context(
552 &mut self,
553 graph: GraphReference<'_, '_>,
554 active: BinaryHeap<Reverse<(usize, (usize, NodeSide))>>,
555 context: usize
556 ) -> Result<(usize, usize), String> {
557 self.clear_paths();
558
559 let mut active = active;
560 let mut visited: BTreeSet<(usize, NodeSide)> = BTreeSet::new();
561 let mut to_remove: BTreeSet<usize> = self.node_iter().collect();
562 let mut inserted = 0;
563 let mut graph = graph;
564
565 while !active.is_empty() {
566 let (distance, node_side) = active.pop().unwrap().0;
567 if visited.contains(&node_side) {
568 continue;
569 }
570 visited.insert(node_side);
571 to_remove.remove(&node_side.0);
572 if !self.has_node(node_side.0) {
573 self.add_node_internal(&mut graph, node_side.0)?;
574 inserted += 1;
575 }
576
577 // We can reach the other side by traversing the node.
578 let other_side = (node_side.0, node_side.1.flip());
579 if !visited.contains(&other_side) {
580 let handle = support::encode_node(node_side.0, support::entry_orientation(node_side.1));
581 let record = self.record(handle).unwrap();
582 let next_distance = distance + record.sequence_len() - 1;
583 if next_distance <= context {
584 active.push(Reverse((next_distance, other_side)));
585 }
586 }
587
588 // The predecessors of this node side are 1 bp away.
589 let handle = support::encode_node(node_side.0, support::exit_orientation(node_side.1));
590 let record = self.record(handle).unwrap();
591 let next_distance = distance + 1;
592 if next_distance <= context {
593 for successor in record.successors() {
594 let node_id = support::node_id(successor);
595 let side = support::entry_side(support::node_orientation(successor));
596 if !visited.contains(&(node_id, side)) {
597 active.push(Reverse((next_distance, (node_id, side))));
598 }
599 }
600 }
601 }
602
603 let removed = to_remove.len();
604 for node_id in to_remove {
605 self.remove_node_internal(node_id);
606 }
607 Ok((inserted, removed))
608 }
609
610 /// Inserts all nodes between the given two handles into the subgraph.
611 ///
612 /// If `start` and `end` are in the same chain in the given order, this will insert all nodes and snarls between them.
613 /// Otherwise the behavior will be unpredictable and it is recommended to set a safety limit to avoid excessive memory usage.
614 /// Removes all path information.
615 /// Returns the number of inserted nodes.
616 ///
617 /// # Arguments
618 ///
619 /// * `graph`: A GBZ-compatible graph.
620 /// * `start`: Start handle (inclusive).
621 /// * `end`: End handle (inclusive).
622 /// * `limit`: Optional safety limit on the number of inserted nodes.
623 ///
624 /// # Examples
625 ///
626 /// ```
627 /// use gbz_base::{GBZRecord, Subgraph, GraphReference};
628 /// use gbz::{GBZ, Orientation};
629 /// use gbz::support;
630 /// use simple_sds::serialize;
631 ///
632 /// let graph_file = support::get_test_data("example.gbz");
633 /// let graph: GBZ = serialize::load_from(&graph_file).unwrap();
634 ///
635 /// let start = support::encode_node(14, Orientation::Forward);
636 /// let end = support::encode_node(17, Orientation::Forward);
637 /// let mut subgraph = Subgraph::new();
638 /// let result = subgraph.between_nodes(GraphReference::Gbz(&graph), start, end, None);
639 /// assert_eq!(result, Ok(4));
640 ///
641 /// let expected_nodes = [14, 15, 16, 17];
642 /// assert!(subgraph.node_iter().eq(expected_nodes.iter().copied()));
643 /// ```
644 pub fn between_nodes(&mut self, graph: GraphReference<'_, '_>, start: usize, end: usize, limit: Option<usize>) -> Result<usize, String> {
645 self.clear_paths();
646
647 // Active handles. We proceed to their successors but not predecessors.
648 let mut active = vec![start, support::flip_node(end)];
649 // Visited node identifiers.
650 let mut visited = HashSet::new();
651 visited.insert(support::decode_node(start).0);
652 visited.insert(support::decode_node(end).0);
653
654 // Determine node ids between the boundary nodes, inclusive.
655 // Add them to the subgraph if they are not already present.
656 let mut graph = graph;
657 let mut inserted = 0;
658 while let Some(curr) = active.pop() {
659 let (node_id, orientation) = support::decode_node(curr);
660 if !self.has_node(node_id) {
661 if let Some(limit) = limit && inserted >= limit {
662 let (start_id, start_o) = support::decode_node(start);
663 let (end_id, end_o) = support::decode_node(end);
664 return Err(format!("Found more than {} new nodes between ({}, {}) and ({}, {})", limit, start_id, start_o, end_id, end_o));
665 }
666 self.add_node_internal(&mut graph, node_id)?;
667 inserted += 1;
668 }
669
670 for (next_id, next_o) in self.supergraph_successors(node_id, orientation).unwrap() {
671 if visited.contains(&next_id) {
672 continue;
673 }
674 let fw_handle = support::encode_node(next_id, next_o);
675 let rev_handle = support::flip_node(fw_handle);
676 active.push(fw_handle); active.push(rev_handle);
677 visited.insert(next_id);
678 }
679 }
680
681 Ok(inserted)
682 }
683
684 /// Extracts nodes in top-level snarls covered by the current subgraph.
685 ///
686 /// Returns the number of inserted nodes.
687 /// Removes all path information.
688 /// If a chains are provided, this will use them for determining the top-level snarls.
689 /// Otherwise the links stored in node records are used.
690 /// Note that chains must be always provided when the subgraph has been extracted from a [`GBZ`] graph.
691 ///
692 /// # Errors
693 ///
694 /// Returns an error if the graph reference is [`GraphReference::None`].
695 /// Passes through errors from accessing the graph.
696 ///
697 /// # Examples
698 ///
699 /// ```
700 /// use gbz::{GBZ, Orientation};
701 /// use gbz::support::{self, Chains};
702 /// use gbz_base::{Subgraph, GraphReference};
703 /// use gbz_base::utils;
704 /// use simple_sds::serialize;
705 /// use std::collections::BTreeSet;
706 ///
707 /// let graph_file = support::get_test_data("example.gbz");
708 /// let graph: GBZ = serialize::load_from(&graph_file).unwrap();
709 /// let chains_file = utils::get_test_data("example.chains");
710 /// let chains: Chains = serialize::load_from(&chains_file).unwrap();
711 ///
712 /// // Extract the boundary nodes of a snarl as the initial subgraph.
713 /// let mut subgraph = Subgraph::new();
714 /// let nodes: BTreeSet<usize> = [14, 17].into_iter().collect();
715 /// let result = subgraph.around_nodes(GraphReference::Gbz(&graph), &nodes, 0);
716 /// assert!(result.is_ok());
717 /// assert!(subgraph.node_iter().eq(nodes.iter().copied()));
718 ///
719 /// // Now extract the snarl between the boundary nodes.
720 /// let result = subgraph.extract_snarls(GraphReference::Gbz(&graph), Some(&chains));
721 /// assert!(result.is_ok());
722 /// let snarl_nodes = [14, 15, 16, 17];
723 /// assert!(subgraph.node_iter().eq(snarl_nodes.iter().copied()));
724 /// ```
725 pub fn extract_snarls(&mut self, graph: GraphReference<'_, '_>, chains: Option<&Chains>) -> Result<usize, String> {
726 let snarls = self.covered_snarls(chains);
727
728 let mut inserted = 0;
729 let mut graph = graph;
730 for (start, end) in snarls {
731 match &mut graph {
732 GraphReference::Gbz(graph) => {
733 inserted += self.between_nodes(GraphReference::Gbz(graph), start, end, None)?;
734 }
735 GraphReference::Db(graph) => {
736 inserted += self.between_nodes(GraphReference::Db(graph), start, end, None)?;
737 },
738 GraphReference::None => {
739 return Err(String::from("No graph reference provided"));
740 }
741 }
742 }
743
744 Ok(inserted)
745 }
746
747 /// Extracts a subgraph around the given query position.
748 ///
749 /// Reuses existing records when possible.
750 /// Removes node records not covered by the query.
751 ///
752 /// # Arguments
753 ///
754 /// * `graph`: A GBZ graph.
755 /// * `path_index`: A path index for the graph, if the query is path-based.
756 /// * `chains`: A set of top-level chains, if the query extracts nodes in covered snarls.
757 /// * `query`: Arguments for extracting the subgraph.
758 ///
759 /// # Errors
760 ///
761 /// Returns an error, if:
762 ///
763 /// * The query or the graph is invalid.
764 /// * The graph does not contain the queried position.
765 /// * A path index is required but not provided, or if the query path has not been indexed.
766 ///
767 /// If an error occurs, the subgraph may contain arbitrary nodes but no paths.
768 ///
769 /// # Examples
770 ///
771 /// ```
772 /// use gbz_base::{PathIndex, Subgraph, SubgraphQuery, HaplotypeOutput};
773 /// use gbz::{GBZ, FullPathName};
774 /// use gbz::support;
775 /// use simple_sds::serialize;
776 ///
777 /// // Get the graph.
778 /// let gbz_file = support::get_test_data("example.gbz");
779 /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
780 ///
781 /// // Create a path index with 3 bp intervals.
782 /// let path_index = PathIndex::new(&graph, 3, false).unwrap();
783 ///
784 /// // Extract a subgraph that contains an 1 bp context around path A offset 2.
785 /// let path_name = FullPathName::generic("A");
786 /// let query = SubgraphQuery::path_offset(&path_name, 2).with_context(1);
787 /// let mut subgraph = Subgraph::new();
788 /// let result = subgraph.from_gbz(&graph, Some(&path_index), None, &query);
789 /// assert!(result.is_ok());
790 ///
791 /// // The subgraph should be centered around 1 bp node 14 of degree 4.
792 /// assert_eq!(subgraph.nodes(), 5);
793 /// assert_eq!(subgraph.paths(), 3);
794 ///
795 /// // We get the same result using a node id.
796 /// let query = SubgraphQuery::nodes([14]).with_context(1);
797 /// let mut subgraph = Subgraph::new();
798 /// let result = subgraph.from_gbz(&graph, None, None, &query);
799 /// assert!(result.is_ok());
800 /// assert_eq!(subgraph.nodes(), 5);
801 /// assert_eq!(subgraph.paths(), 3);
802 /// ```
803 pub fn from_gbz(&mut self, graph: &GBZ, path_index: Option<&PathIndex>, chains: Option<&Chains>, query: &SubgraphQuery) -> Result<(), String> {
804 if query.snarls() && chains.is_none() {
805 return Err(String::from("Top-level chains are required for extracting snarls"));
806 }
807 match query.query_type() {
808 QueryType::PathOffset(query_pos) => {
809 let path_index = path_index.ok_or_else(||
810 String::from("Path index is required for path-based queries")
811 )?;
812 let reference_path = self.path_pos_from_gbz(graph, path_index, query_pos)?;
813 self.around_position(GraphReference::Gbz(graph), reference_path.0.graph_pos(), query.context())?;
814 if query.snarls() {
815 self.extract_snarls(GraphReference::Gbz(graph), chains)?;
816 }
817 self.extract_paths(Some(reference_path), query.output())?;
818 },
819 QueryType::PathInterval(query_pos, len) => {
820 let path_index = path_index.ok_or_else(||
821 String::from("Path index is required for path-based queries")
822 )?;
823 let reference_path = self.path_pos_from_gbz(graph, path_index, query_pos)?;
824 self.around_interval(GraphReference::Gbz(graph), reference_path.0, *len, query.context())?;
825 if query.snarls() {
826 self.extract_snarls(GraphReference::Gbz(graph), chains)?;
827 }
828 self.extract_paths(Some(reference_path), query.output())?;
829 },
830 QueryType::Nodes(nodes) => {
831 if query.output() == HaplotypeOutput::ReferenceOnly {
832 return Err(String::from("Cannot output a reference path in a node-based query"));
833 }
834 self.around_nodes(GraphReference::Gbz(graph), nodes, query.context())?;
835 if query.snarls() {
836 self.extract_snarls(GraphReference::Gbz(graph), chains)?;
837 }
838 self.extract_paths(None, query.output())?;
839 },
840 QueryType::Between((start, end), limit) => {
841 if query.output() == HaplotypeOutput::ReferenceOnly {
842 return Err(String::from("Cannot output a reference path in a node-based query"));
843 }
844 self.between_nodes(GraphReference::Gbz(graph), *start, *end, *limit)?;
845 self.extract_paths(None, query.output())?;
846 },
847 }
848
849 // Determine the stable graph name and relationships.
850 let parent = GraphName::from_gbz(graph);
851 self.compute_name(Some(&parent));
852
853 Ok(())
854 }
855
856 /// Extracts a subgraph around the given query position.
857 ///
858 /// Reuses existing records when possible.
859 /// Removes node records not covered by the query.
860 ///
861 /// # Errors
862 ///
863 /// Returns an error, if:
864 ///
865 /// * The query or the graph is invalid or if there is a database error.
866 /// * The graph does not contain the queried position.
867 /// * A path index is required but not provided, or if the query path has not been indexed.
868 ///
869 /// If an error occurs, the subgraph may contain arbitrary nodes but no paths.
870 ///
871 /// # Examples
872 ///
873 /// ```
874 /// use gbz_base::{GBZBase, GraphInterface, Subgraph, SubgraphQuery, HaplotypeOutput};
875 /// use gbz::FullPathName;
876 /// use gbz::support;
877 /// use simple_sds::serialize;
878 /// use std::fs;
879 ///
880 /// // Create the database.
881 /// let gbz_file = support::get_test_data("example.gbz");
882 /// let db_file = serialize::temp_file_name("subgraph");
883 /// let result = GBZBase::create_from_files(&gbz_file, None, &db_file);
884 /// assert!(result.is_ok());
885 ///
886 /// // Open the database and create a graph interface.
887 /// let database = GBZBase::open(&db_file).unwrap();
888 /// let mut interface = GraphInterface::new(&database).unwrap();
889 ///
890 /// // Extract a subgraph that contains an 1 bp context around path A offset 2.
891 /// let path_name = FullPathName::generic("A");
892 /// let query = SubgraphQuery::path_offset(&path_name, 2).with_context(1);
893 /// let mut subgraph = Subgraph::new();
894 /// let result = subgraph.from_db(&mut interface, &query);
895 /// assert!(result.is_ok());
896 ///
897 /// // The subgraph should be centered around 1 bp node 14 of degree 4.
898 /// assert_eq!(subgraph.nodes(), 5);
899 /// assert_eq!(subgraph.paths(), 3);
900 ///
901 /// // Clean up.
902 /// drop(interface);
903 /// drop(database);
904 /// fs::remove_file(&db_file).unwrap();
905 /// ```
906 pub fn from_db<'reference, 'graph>(&mut self, graph: &'reference mut GraphInterface<'graph>, query: &SubgraphQuery) -> Result<(), String> {
907 match query.query_type() {
908 QueryType::PathOffset(query_pos) => {
909 let reference_path = self.path_pos_from_db(graph, query_pos)?;
910 self.around_position(GraphReference::Db(graph), reference_path.0.graph_pos(), query.context())?;
911 if query.snarls() {
912 self.extract_snarls(GraphReference::Db(graph), None)?;
913 }
914 self.extract_paths(Some(reference_path), query.output())?;
915 },
916 QueryType::PathInterval(query_pos, len) => {
917 let reference_path = self.path_pos_from_db(graph, query_pos)?;
918 self.around_interval(GraphReference::Db(graph), reference_path.0, *len, query.context())?;
919 if query.snarls() {
920 self.extract_snarls(GraphReference::Db(graph), None)?;
921 }
922 self.extract_paths(Some(reference_path), query.output())?;
923 },
924 QueryType::Nodes(nodes) => {
925 if query.output() == HaplotypeOutput::ReferenceOnly {
926 return Err(String::from("Cannot output a reference path in a node-based query"));
927 }
928 self.around_nodes(GraphReference::Db(graph), nodes, query.context())?;
929 if query.snarls() {
930 self.extract_snarls(GraphReference::Db(graph), None)?;
931 }
932 self.extract_paths(None, query.output())?;
933 },
934 QueryType::Between((start, end), limit) => {
935 if query.output() == HaplotypeOutput::ReferenceOnly {
936 return Err(String::from("Cannot output a reference path in a node-based query"));
937 }
938 self.between_nodes(GraphReference::Db(graph), *start, *end, *limit)?;
939 self.extract_paths(None, query.output())?;
940 },
941 }
942
943 // Determine the stable graph name and relationships.
944 let parent = graph.graph_name()?;
945 self.compute_name(Some(&parent));
946
947 Ok(())
948 }
949
950 // Returns the successor position for the given GBWT position, if it is in the subgraph.
951 fn next_pos(pos: Pos, successors: &BTreeMap<usize, Vec<(Pos, bool)>>) -> Option<Pos> {
952 if let Some(v) = successors.get(&pos.node) {
953 let (next, _) = v[pos.offset];
954 if next.node == ENDMARKER || !successors.contains_key(&next.node) {
955 None
956 } else {
957 Some(next)
958 }
959 } else {
960 None
961 }
962 }
963
964 /// Extracts all paths in the subgraph.
965 ///
966 /// Clears the current paths in the subgraph.
967 /// If a reference path is given, one of the paths is assumed to visit the position.
968 /// This will then set all information related to the reference path.
969 ///
970 /// # Arguments
971 ///
972 /// * `reference_path`: Position on the reference path and the name of the path.
973 /// * `output`: How to output the haplotypes.
974 ///
975 /// # Errors
976 ///
977 /// Returns an error if no path visits the reference position.
978 /// Returns an error if reference-only output is requested without a reference path.
979 /// Clears all path information in case of an error.
980 ///
981 /// # Examples
982 ///
983 /// ```
984 /// use gbz_base::{GBZRecord, Subgraph, HaplotypeOutput};
985 /// use gbz::GBZ;
986 /// use gbz::support;
987 /// use simple_sds::serialize;
988 ///
989 /// // Get the graph.
990 /// let gbz_file = support::get_test_data("example.gbz");
991 /// let graph: GBZ = serialize::load_from(&gbz_file).unwrap();
992 ///
993 /// // Start with an empty subgraph and add a node.
994 /// let mut subgraph = Subgraph::new();
995 /// let result = subgraph.add_node_from_gbz(&graph,15);
996 /// assert!(result.is_ok());
997 ///
998 /// // There are no paths until we extract them.
999 /// assert_eq!(subgraph.paths(), 0);
1000 /// let result = subgraph.extract_paths(None, HaplotypeOutput::All);
1001 /// assert!(result.is_ok());
1002 /// assert_eq!(subgraph.paths(), 2);
1003 ///
1004 /// // When we change the subgraph, we need to extract the paths again.
1005 /// for node_id in [13, 14] {
1006 /// let result = subgraph.add_node_from_gbz(&graph, node_id);
1007 /// assert!(result.is_ok());
1008 /// }
1009 /// assert_eq!(subgraph.paths(), 0);
1010 /// let result = subgraph.extract_paths(None, HaplotypeOutput::All);
1011 /// assert!(result.is_ok());
1012 /// assert_eq!(subgraph.paths(), 3);
1013 ///
1014 /// // The same is true for removing nodes.
1015 /// for node_id in [14, 15] {
1016 /// subgraph.remove_node(node_id);
1017 /// }
1018 /// assert_eq!(subgraph.paths(), 0);
1019 /// let result = subgraph.extract_paths(None, HaplotypeOutput::All);
1020 /// assert!(result.is_ok());
1021 /// assert_eq!(subgraph.paths(), 1);
1022 /// ```
1023 pub fn extract_paths(
1024 &mut self,
1025 reference_path: Option<(PathPosition, FullPathName)>,
1026 output: HaplotypeOutput
1027 ) -> Result<(), String> {
1028 self.clear_paths();
1029
1030 let ref_pos;
1031 if let Some((position, name)) = reference_path {
1032 ref_pos = Some(position);
1033 self.ref_path = Some(name);
1034 } else {
1035 ref_pos = None;
1036 }
1037
1038 // Decompress the GBWT node records for the subgraph.
1039 let mut keys: Vec<usize> = Vec::new();
1040 let mut successors: BTreeMap<usize, Vec<(Pos, bool)>> = BTreeMap::new();
1041 for (handle, gbz_record) in self.records.iter() {
1042 let gbwt_record = gbz_record.to_gbwt_record();
1043 let decompressed: Vec<(Pos, bool)> = gbwt_record.decompress().into_iter().map(|x| (x, false)).collect();
1044 keys.push(*handle);
1045 successors.insert(*handle, decompressed);
1046 }
1047
1048 // Mark the positions that have predecessors in the subgraph.
1049 for handle in keys.iter() {
1050 let decompressed = successors.get(handle).unwrap().clone();
1051 for (pos, _) in decompressed.iter() {
1052 if let Some(v) = successors.get_mut(&pos.node) {
1053 v[pos.offset].1 = true;
1054 }
1055 }
1056 }
1057
1058 // TODO: Check for infinite loops.
1059 // Extract all paths and note if one of them passes through `ref_pos`.
1060 // `ref_offset` is the offset of the node containing `ref_pos`.
1061 let mut ref_offset: Option<usize> = None;
1062 for (handle, positions) in successors.iter() {
1063 for (offset, (_, has_predecessor)) in positions.iter().enumerate() {
1064 if *has_predecessor {
1065 continue;
1066 }
1067 let mut curr = Some(Pos::new(*handle, offset));
1068 let mut is_ref = false;
1069 let mut path: Vec<usize> = Vec::new();
1070 let mut len = 0;
1071 while let Some(pos) = curr {
1072 if let Some(position) = ref_pos.as_ref() && pos == position.gbwt_pos() {
1073 self.ref_id = Some(self.paths.len());
1074 ref_offset = Some(path.len());
1075 is_ref = true;
1076 }
1077 path.push(pos.node);
1078 len += self.records.get(&pos.node).unwrap().sequence_len();
1079 curr = Self::next_pos(pos, &successors);
1080 }
1081 if is_ref {
1082 if !support::encoded_path_is_canonical(&path) {
1083 eprintln!("Warning: the reference path is not in canonical orientation");
1084 }
1085 self.paths.push(PathInfo::new(path, len));
1086 } else if support::encoded_path_is_canonical(&path) {
1087 self.paths.push(PathInfo::new(path, len));
1088 }
1089 }
1090 }
1091
1092 // Now we can set the reference interval.
1093 if let Some(ref_pos) = ref_pos {
1094 if let Some(offset) = ref_offset {
1095 let ref_info = &self.paths[self.ref_id.unwrap()];
1096 self.ref_interval = Some(ref_info.path_interval(self, offset, &ref_pos));
1097 } else {
1098 self.clear_paths();
1099 return Err(String::from("Could not find the reference path"));
1100 }
1101 }
1102
1103 // Haplotype output.
1104 if output == HaplotypeOutput::Distinct {
1105 self.distinct_paths();
1106 } else if output == HaplotypeOutput::ReferenceOnly {
1107 self.reference_only()?;
1108 }
1109
1110 Ok(())
1111 }
1112
1113 // Sorts the paths and merges duplicates, storing the count in the weight field.
1114 fn distinct_paths(&mut self) {
1115 let ref_path = self.ref_id.map(|id| self.paths[id].path.clone());
1116 self.paths.sort_unstable();
1117
1118 let mut new_paths: Vec<PathInfo> = Vec::new();
1119 let mut ref_id = None;
1120 for info in self.paths.iter() {
1121 if new_paths.is_empty() || new_paths.last().unwrap().path != info.path {
1122 if let Some(ref_path) = &ref_path && info.path == *ref_path {
1123 ref_id = Some(new_paths.len());
1124 }
1125 new_paths.push(PathInfo::weighted(info.path.clone(), info.len));
1126 } else {
1127 new_paths.last_mut().unwrap().increment_weight();
1128 }
1129 }
1130
1131 self.paths = new_paths;
1132 self.ref_id = ref_id;
1133 }
1134
1135 // Removes all paths except the reference path.
1136 fn reference_only(&mut self) -> Result<(), String> {
1137 if self.ref_id.is_none() {
1138 return Err(String::from("Reference path is required for reference-only output"));
1139 }
1140 let ref_id = self.ref_id.unwrap();
1141 let ref_info = self.paths[ref_id].clone();
1142 self.paths = vec![ref_info];
1143 self.ref_id = Some(0);
1144 Ok(())
1145 }
1146
1147 // Adds a new node to the subgraph.
1148 fn add_node_internal(&mut self, graph: &mut GraphReference<'_, '_>, node_id: usize) -> Result<(), String> {
1149 let forward = graph.gbz_record(support::encode_node(node_id, Orientation::Forward))?;
1150 let reverse = graph.gbz_record(support::encode_node(node_id, Orientation::Reverse))?;
1151 self.records.insert(forward.handle(), forward);
1152 self.records.insert(reverse.handle(), reverse);
1153 Ok(())
1154 }
1155
1156 /// Inserts the given node into the subgraph.
1157 ///
1158 /// No effect if the node is already in the subgraph.
1159 /// Clears all path information from the subgraph.
1160 /// See also [`Self::add_node_from_db`].
1161 ///
1162 /// # Arguments
1163 ///
1164 /// * `graph`: A GBZ graph.
1165 /// * `node_id`: Identifier of the node to insert.
1166 ///
1167 /// # Errors
1168 ///
1169 /// Returns an error if the node does not exist in the graph.
1170 pub fn add_node_from_gbz(&mut self, graph: &GBZ, node_id: usize) -> Result<(), String> {
1171 if self.has_node(node_id) {
1172 return Ok(());
1173 }
1174 self.clear_paths();
1175 self.add_node_internal(&mut GraphReference::Gbz(graph), node_id)
1176 }
1177
1178 /// Inserts the given node into the subgraph.
1179 ///
1180 /// No effect if the node is already in the subgraph.
1181 /// Clears all path information from the subgraph.
1182 /// See also [`Self::add_node_from_gbz`].
1183 ///
1184 /// # Arguments
1185 ///
1186 /// * `graph`: Graph interface.
1187 /// * `node_id`: Identifier of the node to insert.
1188 ///
1189 /// # Errors
1190 ///
1191 /// Returns an error if the node does not exist in the graph.
1192 /// Passes through any errors from the database.
1193 pub fn add_node_from_db(&mut self, graph: &mut GraphInterface, node_id: usize) -> Result<(), String> {
1194 if self.has_node(node_id) {
1195 return Ok(());
1196 }
1197 self.clear_paths();
1198 self.add_node_internal(&mut GraphReference::Db(graph), node_id)
1199 }
1200
1201 // Removes the given node from the subgraph.
1202 fn remove_node_internal(&mut self, node_id: usize) {
1203 self.records.remove(&support::encode_node(node_id, Orientation::Forward));
1204 self.records.remove(&support::encode_node(node_id, Orientation::Reverse));
1205 }
1206
1207 /// Removes the given node from the subgraph.
1208 ///
1209 /// No effect if the node is not in the subgraph.
1210 /// Clears all path information from the subgraph.
1211 pub fn remove_node(&mut self, node_id: usize) {
1212 if !self.has_node(node_id) {
1213 return;
1214 }
1215 self.clear_paths();
1216 self.remove_node_internal(node_id);
1217 }
1218
1219 // Clears all path information from the subgraph.
1220 fn clear_paths(&mut self) {
1221 self.paths.clear();
1222 self.ref_id = None;
1223 self.ref_path = None;
1224 self.ref_interval = None;
1225 }
1226}
1227
1228//-----------------------------------------------------------------------------
1229
1230/// Node/edge operations similar to [`GBZ`].
1231impl Subgraph {
1232 /// Returns the number of nodes in the subgraph.
1233 #[inline]
1234 pub fn nodes(&self) -> usize {
1235 self.records.len() / 2
1236 }
1237
1238 /// Returns the smallest node identifier in the subgraph.
1239 #[inline]
1240 pub fn min_node(&self) -> Option<usize> {
1241 self.records.keys().next().map(|&handle| support::node_id(handle))
1242 }
1243
1244 /// Returns the largest node identifier in the subgraph.
1245 #[inline]
1246 pub fn max_node(&self) -> Option<usize> {
1247 self.records.keys().next_back().map(|&handle| support::node_id(handle))
1248 }
1249
1250 /// Returns the smallest handle in the subgraph.
1251 #[inline]
1252 pub fn min_handle(&self) -> Option<usize> {
1253 self.records.keys().next().copied()
1254 }
1255
1256 /// Returns the largest handle in the subgraph.
1257 #[inline]
1258 pub fn max_handle(&self) -> Option<usize> {
1259 self.records.keys().next_back().copied()
1260 }
1261
1262 /// Returns `true` if the subgraph contains the given node.
1263 #[inline]
1264 pub fn has_node(&self, node_id: usize) -> bool {
1265 self.records.contains_key(&support::encode_node(node_id, Orientation::Forward))
1266 }
1267
1268 // Returns `true` if the subgraph contains the given handle.
1269 #[inline]
1270 pub fn has_handle(&self, handle: usize) -> bool {
1271 self.records.contains_key(&handle)
1272 }
1273
1274 /// Returns the sequence for the handle in the subgraph, or [`None`] if there is no such handle.
1275 #[inline]
1276 pub fn sequence_for_handle(&self, handle: usize) -> Option<&[u8]> {
1277 self.records.get(&handle).map(|record| record.sequence())
1278 }
1279
1280 /// Returns the sequence for the node in the subgraph, or [`None`] if there is no such node.
1281 #[inline]
1282 pub fn sequence(&self, node_id: usize) -> Option<&[u8]> {
1283 self.sequence_for_handle(support::encode_node(node_id, Orientation::Forward))
1284 }
1285
1286 /// Returns the sequence for the node in the given orientation, or [`None`] if there is no such node.
1287 #[inline]
1288 pub fn oriented_sequence(&self, node_id: usize, orientation: Orientation) -> Option<&[u8]> {
1289 self.sequence_for_handle(support::encode_node(node_id, orientation))
1290 }
1291
1292 /// Returns the length of the sequence for the node in the subgraph, or [`None`] if there is no such node.
1293 #[inline]
1294 pub fn sequence_len(&self, node_id: usize) -> Option<usize> {
1295 self.records.get(&support::encode_node(node_id, Orientation::Forward)).map(|record| record.sequence_len())
1296 }
1297
1298 /// Returns an iterator over the node identifiers in the subgraph.
1299 ///
1300 /// The identifiers are listed in ascending order.
1301 pub fn node_iter(&'_ self) -> impl Iterator<Item = usize> + use<'_> {
1302 self.records.keys().step_by(2).map(|&handle| support::node_id(handle))
1303 }
1304
1305 /// Returns an iterator over the handles in the subgraph.
1306 ///
1307 /// The handles are listed in ascending order.
1308 pub fn handle_iter(&'_ self) -> impl Iterator<Item = usize> + use<'_> {
1309 self.records.keys().copied()
1310 }
1311
1312 /// Returns an iterator over the successors of an oriented node, or [`None`] if there is no such node.
1313 pub fn successors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1314 let handle = support::encode_node(node_id, orientation);
1315 let record = self.records.get(&handle)?;
1316 Some(EdgeIter::new(self, record, false))
1317 }
1318
1319 /// Returns an iterator over the successors of an oriented node in the supergraph, or [`None`] if there is no such node.
1320 ///
1321 /// This is otherwise the same as [`Self::successors`], but this also lists successors not in the subgraph.
1322 pub fn supergraph_successors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1323 let handle = support::encode_node(node_id, orientation);
1324 let record = self.records.get(&handle)?;
1325 Some(EdgeIter::in_supergraph(self, record, false))
1326 }
1327
1328 /// Returns an iterator over the predecessors of an oriented node, or [`None`] if there is no such node.
1329 pub fn predecessors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1330 let handle = support::encode_node(node_id, orientation.flip());
1331 let record = self.records.get(&handle)?;
1332 Some(EdgeIter::new(self, record, true))
1333 }
1334
1335 /// Returns an iterator over the predecessors of an oriented node in the supergraph, or [`None`] if there is no such node.
1336 ///
1337 /// This is otherwise the same as [`Self::predecessors`], but this also lists predecessors not in the subgraph.
1338 pub fn supergraph_predecessors(&self, node_id: usize, orientation: Orientation) -> Option<EdgeIter<'_>> {
1339 let handle = support::encode_node(node_id, orientation.flip());
1340 let record = self.records.get(&handle)?;
1341 Some(EdgeIter::in_supergraph(self, record, true))
1342 }
1343
1344 // Returns the record for the given node handle, or [`None`] if the node is not in the subgraph.
1345 #[inline]
1346 fn record(&self, handle: usize) -> Option<&GBZRecord> {
1347 self.records.get(&handle)
1348 }
1349
1350 /// Returns the number of paths in the subgraph.
1351 #[inline]
1352 pub fn paths(&self) -> usize {
1353 self.paths.len()
1354 }
1355
1356 /// Returns the top-level snarls covered by the current subgraph.
1357 ///
1358 /// Each snarl is reported as a pair of handles `(start, end)` of its boundary nodes.
1359 /// Here `start` points inside the snarl and `end` points outside the snarl.
1360 /// The snarls are returned in the canonical orientation (see [`support::edge_is_canonical`]).
1361 ///
1362 /// By default, this uses the chain links stored in node records.
1363 /// However, a [`crate::GBZBase`] does not always store top-level chains, and a [`GBZ`] graph does not have that information.
1364 /// In such cases a [`Chains`] object can be provided as an override.
1365 pub fn covered_snarls(&self, chains: Option<&Chains>) -> BTreeSet<(usize, usize)> {
1366 let mut snarls: BTreeSet<(usize, usize)> = BTreeSet::new();
1367 for (&handle, record) in self.records.iter() {
1368 let next = if let Some(chains) = chains {
1369 chains.next(handle)
1370 } else {
1371 record.next()
1372 };
1373 if next.is_none() {
1374 continue;
1375 }
1376 let next = next.unwrap();
1377 if !self.has_handle(next) {
1378 continue;
1379 }
1380 if support::encoded_edge_is_canonical(handle, next) {
1381 snarls.insert((handle, next));
1382 }
1383 }
1384 snarls
1385 }
1386}
1387
1388//-----------------------------------------------------------------------------
1389
1390/// Alignment to the reference.
1391impl Subgraph {
1392 // Returns the total length of the nodes.
1393 fn path_len(&self, path: &[usize]) -> usize {
1394 let mut result = 0;
1395 for handle in path.iter() {
1396 let record = self.records.get(handle).unwrap();
1397 result += record.sequence_len();
1398 }
1399 result
1400 }
1401
1402 // Appends a new edit or extends the previous one. No effect if `len` is zero.
1403 fn append_edit(edits: &mut Vec<(EditOperation, usize)>, op: EditOperation, len: usize) {
1404 if len == 0 {
1405 return;
1406 }
1407 if let Some((prev_op, prev_len)) = edits.last_mut() && *prev_op == op {
1408 *prev_len += len;
1409 return;
1410 }
1411 edits.push((op, len));
1412 }
1413
1414 // Returns the number of matching bases at the start of the two paths.
1415 fn prefix_matches(&self, path: &[usize], ref_path: &[usize]) -> usize {
1416 let mut result = 0;
1417 let mut path_offset = 0;
1418 let mut ref_offset = 0;
1419 let mut path_base = 0;
1420 let mut ref_base = 0;
1421 while path_offset < path.len() && ref_offset < ref_path.len() {
1422 let a = self.records.get(&path[path_offset]).unwrap().sequence();
1423 let b = self.records.get(&ref_path[ref_offset]).unwrap().sequence();
1424 while path_base < a.len() && ref_base < b.len() {
1425 if a[path_base] != b[ref_base] {
1426 return result;
1427 }
1428 path_base += 1;
1429 ref_base += 1;
1430 result += 1;
1431 }
1432 if path_base == a.len() {
1433 path_offset += 1;
1434 path_base = 0;
1435 }
1436 if ref_base == b.len() {
1437 ref_offset += 1;
1438 ref_base = 0;
1439 }
1440 }
1441 result
1442 }
1443
1444 // Returns the number of matching bases at the end of the two paths.
1445 fn suffix_matches(&self, path: &[usize], ref_path: &[usize]) -> usize {
1446 let mut result = 0;
1447 let mut path_offset = 0;
1448 let mut ref_offset = 0;
1449 let mut path_base = 0;
1450 let mut ref_base = 0;
1451 while path_offset < path.len() && ref_offset < ref_path.len() {
1452 let a = self.records.get(&path[path.len() - path_offset - 1]).unwrap().sequence();
1453 let b = self.records.get(&ref_path[ref_path.len() - ref_offset - 1]).unwrap().sequence();
1454 while path_base < a.len() && ref_base < b.len() {
1455 if a[a.len() - path_base - 1] != b[b.len() - ref_base - 1] {
1456 return result;
1457 }
1458 path_base += 1;
1459 ref_base += 1;
1460 result += 1;
1461 }
1462 if path_base == a.len() {
1463 path_offset += 1;
1464 path_base = 0;
1465 }
1466 if ref_base == b.len() {
1467 ref_offset += 1;
1468 ref_base = 0;
1469 }
1470 }
1471 result
1472 }
1473
1474 // Returns the penalty for a mismatch of the given length.
1475 fn mismatch_penalty(len: usize) -> usize {
1476 4 * len
1477 }
1478
1479 // Returns the penalty for a gap of the given length.
1480 fn gap_penalty(len: usize) -> usize {
1481 if len == 0 {
1482 0
1483 } else {
1484 6 + (len - 1)
1485 }
1486 }
1487
1488 // Appends edits corresponding to the alignment of the given (sub)paths.
1489 // The paths are assumed to be diverging, but there may be base-level matches at the start/end.
1490 // The middle part is either mismatch + gap or insertion + deletion, with gap length possibly 0.
1491 // Alignment scoring follows the parameters used in vg:
1492 // +1 for a match, -4 for a mismatch, -6 for gap open, -1 for gap extend.
1493 //
1494 // NOTE: It is possible that some of the mismatches are actually matches.
1495 // But we ignore this possibility, as the paths are assumed to be diverging.
1496 fn align(&self, path: &[usize], ref_path: &[usize], edits: &mut Vec<(EditOperation, usize)>) {
1497 let path_len = self.path_len(path);
1498 let ref_len = self.path_len(ref_path);
1499 let prefix = self.prefix_matches(path, ref_path);
1500 let mut suffix = self.suffix_matches(path, ref_path);
1501 if prefix + suffix > path_len {
1502 suffix = path_len - prefix;
1503 }
1504 if prefix + suffix > ref_len {
1505 suffix = ref_len - prefix;
1506 }
1507
1508
1509 Self::append_edit(edits, EditOperation::Match, prefix);
1510 let path_middle = path_len - prefix - suffix;
1511 let ref_middle = ref_len - prefix - suffix;
1512 if path_middle == 0 {
1513 Self::append_edit(edits, EditOperation::Deletion, ref_middle);
1514 } else if ref_middle == 0 {
1515 Self::append_edit(edits, EditOperation::Insertion, path_middle);
1516 } else {
1517 let mismatch = cmp::min(path_middle, ref_middle);
1518 let mismatch_indel =
1519 Self::mismatch_penalty(mismatch) +
1520 Self::gap_penalty(path_middle - mismatch) +
1521 Self::gap_penalty(ref_middle - mismatch);
1522 let insertion_deletion = Self::gap_penalty(path_middle) + Self::gap_penalty(ref_middle);
1523 if mismatch_indel <= insertion_deletion {
1524 Self::append_edit(edits, EditOperation::Match, mismatch);
1525 Self::append_edit(edits, EditOperation::Insertion, path_middle - mismatch);
1526 Self::append_edit(edits, EditOperation::Deletion, ref_middle - mismatch);
1527 } else {
1528 Self::append_edit(edits, EditOperation::Insertion, path_middle);
1529 Self::append_edit(edits, EditOperation::Deletion, ref_middle);
1530 }
1531 }
1532 Self::append_edit(edits, EditOperation::Match, suffix);
1533 }
1534
1535 // Returns the CIGAR string for the given path, aligned to the reference path.
1536 // Takes the alignment from the LCS of the paths weighted by node lengths.
1537 // Diverging parts are aligned using `align()`.
1538 fn align_to_ref(&self, path_id: usize) -> Option<String> {
1539 let ref_id = self.ref_id?;
1540 if path_id == ref_id || path_id >= self.paths.len() {
1541 return None;
1542 }
1543
1544 // Find the LCS of the paths weighted by node lengths.
1545 let weight = &|handle: usize| -> usize {
1546 self.records.get(&handle).unwrap().sequence_len()
1547 };
1548 let path = &self.paths[path_id].path;
1549 let ref_path = &self.paths[ref_id].path;
1550 let (lcs, _) = algorithms::fast_weighted_lcs(path, ref_path, weight);
1551
1552 // Convert the LCS to a sequence of edit operations
1553 let mut edits: Vec<(EditOperation, usize)> = Vec::new();
1554 let mut path_offset = 0;
1555 let mut ref_offset = 0;
1556 for (next_path_offset, next_ref_offset) in lcs.iter() {
1557 let path_interval = &path[path_offset..*next_path_offset];
1558 let ref_interval = &ref_path[ref_offset..*next_ref_offset];
1559 self.align(path_interval, ref_interval, &mut edits);
1560 let node_len = self.records.get(&path[*next_path_offset]).unwrap().sequence_len();
1561 Self::append_edit(&mut edits, EditOperation::Match, node_len);
1562 path_offset = next_path_offset + 1;
1563 ref_offset = next_ref_offset + 1;
1564 }
1565 self.align(&path[path_offset..], &ref_path[ref_offset..], &mut edits);
1566
1567 // Convert the edits to a CIGAR string.
1568 let mut result = String::new();
1569 for (op, len) in edits.iter() {
1570 result.push_str(&format!("{}{}", len, op));
1571 }
1572 Some(result)
1573 }
1574}
1575
1576//-----------------------------------------------------------------------------
1577
1578/// Output formats.
1579impl Subgraph {
1580 /// Writes the subgraph in the GFA format to the given output.
1581 ///
1582 /// The output is a full GFA file, including the header.
1583 /// If `cigar` is true, the CIGAR strings for the non-reference haplotypes are included in the output.
1584 pub fn write_gfa<T: Write>(&self, output: &mut T, cigar: bool) -> io::Result<()> {
1585 // Header.
1586 let reference_samples = self.ref_path.as_ref().map(|path| path.sample.as_ref());
1587 formats::write_gfa_header(reference_samples, output)?;
1588 if let Some(graph_name) = self.graph_name.as_ref() {
1589 let header_lines = graph_name.to_gfa_header_lines();
1590 formats::write_header_lines(&header_lines, output)?;
1591 }
1592
1593 // Segments.
1594 for (handle, record) in self.records.iter() {
1595 if support::node_orientation(*handle) == Orientation::Forward {
1596 formats::write_gfa_node(record.id(), record.sequence(), output)?;
1597 }
1598 }
1599
1600 // Links.
1601 for (handle, record) in self.records.iter() {
1602 let from = support::decode_node(*handle);
1603 for successor in record.successors() {
1604 let to = support::decode_node(successor);
1605 if self.has_handle(successor) && support::edge_is_canonical(from, to) {
1606 formats::write_gfa_link(
1607 (from.0.to_string().as_bytes(), from.1),
1608 (to.0.to_string().as_bytes(), to.1),
1609 output
1610 )?;
1611 }
1612 }
1613 }
1614
1615 // Paths.
1616 if let Some((metadata, ref_id)) = self.ref_metadata() {
1617 formats::write_gfa_walk(&self.paths[ref_id].path, &metadata, output)?;
1618 }
1619 let mut haplotype = 1;
1620 let contig_name = self.contig_name();
1621 for (id, path_info) in self.paths.iter().enumerate() {
1622 if Some(id) == self.ref_id {
1623 continue;
1624 }
1625 let mut metadata = WalkMetadata::anonymous(haplotype, &contig_name, path_info.len);
1626 metadata.add_weight(path_info.weight);
1627 if cigar {
1628 metadata.add_cigar(self.align_to_ref(id));
1629 }
1630 formats::write_gfa_walk(&path_info.path, &metadata, output)?;
1631 haplotype += 1;
1632 }
1633
1634 Ok(())
1635 }
1636
1637 // TODO: We cannot include graph name, as there is no header information.
1638 /// Writes the subgraph in the JSON format to the given output.
1639 ///
1640 /// If `cigar` is true, the CIGAR strings for the non-reference haplotypes are included in the output.
1641 pub fn write_json<T: Write>(&self, output: &mut T, cigar: bool) -> io::Result<()> {
1642 // Nodes.
1643 let mut nodes: Vec<JSONValue> = Vec::new();
1644 for (_, record) in self.records.iter() {
1645 let (id, orientation) = support::decode_node(record.handle());
1646 if orientation == Orientation::Reverse {
1647 continue;
1648 }
1649 let node = JSONValue::Object(vec![
1650 ("id".to_string(), JSONValue::String(id.to_string())),
1651 ("sequence".to_string(), JSONValue::String(String::from_utf8_lossy(record.sequence()).to_string())),
1652 ]);
1653 nodes.push(node);
1654 }
1655
1656 // Edges.
1657 let mut edges: Vec<JSONValue> = Vec::new();
1658 for (handle, record) in self.records.iter() {
1659 let from = support::decode_node(*handle);
1660 for successor in record.successors() {
1661 let to = support::decode_node(successor);
1662 if self.has_handle(successor) && support::edge_is_canonical(from, to) {
1663 let edge = JSONValue::Object(vec![
1664 ("from".to_string(), JSONValue::String(from.0.to_string())),
1665 ("from_is_reverse".to_string(), JSONValue::Boolean(from.1 == Orientation::Reverse)),
1666 ("to".to_string(), JSONValue::String(to.0.to_string())),
1667 ("to_is_reverse".to_string(), JSONValue::Boolean(to.1 == Orientation::Reverse)),
1668 ]);
1669 edges.push(edge);
1670 }
1671 }
1672 }
1673
1674 // Paths.
1675 let mut paths: Vec<JSONValue> = Vec::new();
1676 if let Some((metadata, ref_id)) = self.ref_metadata() {
1677 let ref_path = formats::json_path(&self.paths[ref_id].path, &metadata);
1678 paths.push(ref_path);
1679 }
1680 let mut haplotype = 1;
1681 let contig_name = self.contig_name();
1682 for (id, path_info) in self.paths.iter().enumerate() {
1683 if Some(id) == self.ref_id {
1684 continue;
1685 }
1686 let mut metadata = WalkMetadata::anonymous(haplotype, &contig_name, path_info.len);
1687 metadata.add_weight(path_info.weight);
1688 if cigar {
1689 metadata.add_cigar(self.align_to_ref(id));
1690 }
1691 let path = formats::json_path(&path_info.path, &metadata);
1692 paths.push(path);
1693 haplotype += 1;
1694 }
1695
1696 let result = JSONValue::Object(vec![
1697 ("nodes".to_string(), JSONValue::Array(nodes)),
1698 ("edges".to_string(), JSONValue::Array(edges)),
1699 ("paths".to_string(), JSONValue::Array(paths)),
1700 ]);
1701 output.write_all(result.to_string().as_bytes())?;
1702
1703 Ok(())
1704 }
1705
1706 // Builds metadata for the reference path.
1707 fn ref_metadata(&self) -> Option<(WalkMetadata, usize)> {
1708 let ref_id = self.ref_id?;
1709 let ref_path = self.ref_path.as_ref()?;
1710 let interval = self.ref_interval.as_ref()?.clone();
1711 let mut metadata = WalkMetadata::path_interval(ref_path, interval);
1712 metadata.add_weight(self.paths[ref_id].weight);
1713 Some((metadata, ref_id))
1714 }
1715
1716 // Determines a contig name for the subgraph.
1717 fn contig_name(&self) -> String {
1718 if let Some(ref_path) = self.ref_path.as_ref() {
1719 ref_path.contig.clone()
1720 } else {
1721 String::from("unknown")
1722 }
1723 }
1724}
1725
1726//-----------------------------------------------------------------------------
1727
1728/// Graph names.
1729impl Subgraph {
1730 /// Returns the stable graph name (pggname) for the subgraph.
1731 pub fn graph_name(&self) -> Option<&GraphName> {
1732 self.graph_name.as_ref()
1733 }
1734
1735 /// Returns `true` if the subgraph has a graph name.
1736 pub fn has_graph_name(&self) -> bool {
1737 self.graph_name.is_some()
1738 }
1739
1740 /// Computes and stores the stable graph name (pggname) for the subgraph.
1741 ///
1742 /// If the name of the parent graph is given, this graph will be marked as its subgraph.
1743 /// All other graph relationships are also copied from the parent.
1744 /// No effect if the subgraph already has a graph name.
1745 pub fn compute_name(&mut self, parent: Option<&GraphName>) {
1746 if self.has_graph_name() {
1747 return;
1748 }
1749 let name = pggname::stable_name(self);
1750 let mut result = GraphName::new(name);
1751 if let Some(parent) = parent {
1752 result.make_subgraph_of(parent);
1753 }
1754 self.graph_name = Some(result);
1755 }
1756}
1757
1758impl Graph for Subgraph {
1759 fn new() -> Self {
1760 unimplemented!();
1761 }
1762
1763 fn add_node(&mut self, _: &[u8], _: &[u8]) -> Result<(), String> {
1764 unimplemented!();
1765 }
1766
1767 fn add_edge(&mut self, _: &[u8], _: Orientation, _: &[u8], _: Orientation) -> Result<(), String> {
1768 unimplemented!();
1769 }
1770
1771 fn finalize(&mut self) -> Result<(), String> {
1772 Ok(())
1773 }
1774
1775 fn statistics(&self) -> (usize, usize, usize) {
1776 let node_count = self.nodes();
1777 let mut edge_count = 0;
1778 let mut seq_len = 0;
1779 for (handle, record) in self.records.iter() {
1780 let (from_id, from_o) = support::decode_node(*handle);
1781 if from_o == Orientation::Forward {
1782 seq_len += record.sequence_len();
1783 }
1784 for successor in record.successors() {
1785 let (to_id, to_o) = support::decode_node(successor);
1786 if self.has_node(to_id) && support::edge_is_canonical((from_id, from_o), (to_id, to_o)) {
1787 edge_count += 1;
1788 }
1789 }
1790 }
1791 (node_count, edge_count, seq_len)
1792 }
1793
1794 fn node_iter(&self) -> impl Iterator<Item=Vec<u8>> {
1795 self.node_iter().map(|from_id| {
1796 let sequence = self.sequence(from_id).unwrap().to_vec();
1797 let mut node = NodeInt::new(Some(sequence));
1798 for from_o in [Orientation::Forward, Orientation::Reverse] {
1799 for (to_id, to_o) in self.successors(from_id, from_o).unwrap() {
1800 if support::edge_is_canonical((from_id, from_o), (to_id, to_o)) {
1801 node.edges.push((from_o, to_id, to_o));
1802 }
1803 }
1804 }
1805 node.finalize();
1806 node.serialize(from_id)
1807 })
1808 }
1809}
1810
1811//-----------------------------------------------------------------------------
1812
1813/// A path position represented in multiple ways.
1814#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1815pub struct PathPosition {
1816 // Sequence offset from the start of the path.
1817 seq_offset: usize,
1818 // GBWT node identifier encoding an oriented node.
1819 gbwt_node: usize,
1820 // Offset from the start of the node.
1821 node_offset: usize,
1822 // Offset within GBWT record.
1823 gbwt_offset: usize,
1824}
1825
1826impl PathPosition {
1827 /// Returns the sequence offset in bp from the start of the path.
1828 #[inline]
1829 pub fn seq_offset(&self) -> usize {
1830 self.seq_offset
1831 }
1832
1833 /// Returns the node identifier.
1834 #[inline]
1835 pub fn node_id(&self) -> usize {
1836 support::node_id(self.gbwt_node)
1837 }
1838
1839 /// Returns the orientation of the node.
1840 #[inline]
1841 pub fn orientation(&self) -> Orientation {
1842 support::node_orientation(self.gbwt_node)
1843 }
1844
1845 /// Returns the offset from the start of the node.
1846 #[inline]
1847 pub fn node_offset(&self) -> usize {
1848 self.node_offset
1849 }
1850
1851 /// Returns the graph position.
1852 #[inline]
1853 pub fn graph_pos(&self) -> GraphPosition {
1854 GraphPosition {
1855 node: self.node_id(),
1856 orientation: self.orientation(),
1857 offset: self.node_offset(),
1858 }
1859 }
1860
1861 /// Returns the GBWT node identifier / handle.
1862 #[inline]
1863 pub fn handle(&self) -> usize {
1864 self.gbwt_node
1865 }
1866
1867 /// Returns the offset within the GBWT record.
1868 #[inline]
1869 pub fn gbwt_offset(&self) -> usize {
1870 self.gbwt_offset
1871 }
1872
1873 /// Returns the GBWT position for the oriented node containing the position.
1874 #[inline]
1875 pub fn gbwt_pos(&self) -> Pos {
1876 Pos {
1877 node: self.handle(),
1878 offset: self.gbwt_offset(),
1879 }
1880 }
1881}
1882
1883// TODO: Add hash of the path for faster comparisons?
1884#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
1885struct PathInfo {
1886 path: Vec<usize>,
1887 len: usize,
1888 weight: Option<usize>,
1889}
1890
1891impl PathInfo {
1892 fn new(path: Vec<usize>, len: usize) -> Self {
1893 PathInfo { path, len, weight: None }
1894 }
1895
1896 fn weighted(path: Vec<usize>, len: usize) -> Self {
1897 PathInfo { path, len, weight: Some(1) }
1898 }
1899
1900 fn increment_weight(&mut self) {
1901 if let Some(weight) = self.weight {
1902 self.weight = Some(weight + 1);
1903 }
1904 }
1905
1906 // Returns the sequence interval (in bp) for this path, assuming that `path[path_offset]` matches `path_pos`.
1907 // The sequence interval is for the part of the path contained in the subgraph.
1908 fn path_interval(&self, subgraph: &Subgraph, path_offset: usize, path_pos: &PathPosition) -> Range<usize> {
1909 // Distance from the start of the path to `path_pos`.
1910 let mut ref_pos = path_pos.node_offset();
1911 for handle in self.path.iter().take(path_offset) {
1912 ref_pos += subgraph.records.get(handle).unwrap().sequence_len();
1913 }
1914
1915 let start = path_pos.seq_offset() - ref_pos;
1916 let end = start + self.len;
1917 start..end
1918 }
1919}
1920
1921#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
1922enum EditOperation {
1923 Match,
1924 Insertion,
1925 Deletion,
1926}
1927
1928impl Display for EditOperation {
1929 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1930 match self {
1931 EditOperation::Match => write!(f, "M"),
1932 EditOperation::Insertion => write!(f, "I"),
1933 EditOperation::Deletion => write!(f, "D"),
1934 }
1935 }
1936}
1937
1938//-----------------------------------------------------------------------------
1939
1940/// An iterator over the predecessors or successors of a node.
1941///
1942/// The type of `Item` is `(`[`usize`]`, `[`Orientation`]`)`.
1943/// These values encode the identifier and the orientation of the node.
1944/// Successor nodes are always listed in sorted order.
1945/// Predecessor nodes are sorted by their identifiers, but the reverse orientation is listed before the forward orientation.
1946/// Like [`gbz::gbz::EdgeIter`], but only for edges in the subgraph.
1947#[derive(Clone, Debug)]
1948pub struct EdgeIter<'a> {
1949 parent: &'a Subgraph,
1950 // Successors of the record, excluding the possible endmarker.
1951 successors: &'a [Pos],
1952 // The first outrank that has not been visited.
1953 next: usize,
1954 // The first outrank that we should not visit.
1955 limit: usize,
1956 // Flip the orientation in the iterated values.
1957 flip: bool,
1958 // Iterate over the edges in the supergraph, not the subgraph.
1959 supergraph: bool,
1960}
1961
1962
1963impl<'a> EdgeIter<'a> {
1964 /// Creates a new iterator over the successors of the record.
1965 ///
1966 /// If `flip` is `true`, the iterator will flip the orientation of the successors.
1967 /// This is effectively the same as listing the predecessors of the reverse orientation of the record.
1968 pub fn new(parent: &'a Subgraph, record: &'a GBZRecord, flip: bool) -> Self {
1969 let successors = record.edges_slice();
1970 let limit = successors.len();
1971 EdgeIter {
1972 parent,
1973 successors,
1974 next: 0,
1975 limit,
1976 flip,
1977 supergraph: false,
1978 }
1979 }
1980
1981 /// Creates a new iterator over the successors of the record in the supergraph.
1982 ///
1983 /// This is otherwise the same as [`Self::new`], but this also lists successors not in the subgraph.
1984 pub fn in_supergraph(parent: &'a Subgraph, record: &'a GBZRecord, flip: bool) -> Self {
1985 let successors = record.edges_slice();
1986 let limit = successors.len();
1987 EdgeIter {
1988 parent,
1989 successors,
1990 next: 0,
1991 limit,
1992 flip,
1993 supergraph: true,
1994 }
1995 }
1996}
1997
1998impl<'a> Iterator for EdgeIter<'a> {
1999 type Item = (usize, Orientation);
2000
2001 fn next(&mut self) -> Option<Self::Item> {
2002 while self.next < self.limit {
2003 let handle = self.successors[self.next].node;
2004 self.next += 1;
2005 if self.supergraph || self.parent.has_handle(handle) {
2006 let node_id = support::node_id(handle);
2007 let orientation = if self.flip {
2008 support::node_orientation(handle).flip()
2009 } else {
2010 support::node_orientation(handle)
2011 };
2012 return Some((node_id, orientation));
2013 }
2014 }
2015 None
2016 }
2017}
2018
2019impl<'a> DoubleEndedIterator for EdgeIter<'a> {
2020 fn next_back(&mut self) -> Option<Self::Item> {
2021 while self.next < self.limit {
2022 let handle = self.successors[self.limit - 1].node;
2023 self.limit -= 1;
2024 if self.supergraph || self.parent.has_handle(handle) {
2025 let node_id = support::node_id(handle);
2026 let orientation = if self.flip {
2027 support::node_orientation(handle).flip()
2028 } else {
2029 support::node_orientation(handle)
2030 };
2031 return Some((node_id, orientation));
2032 }
2033 }
2034 None
2035 }
2036}
2037
2038impl<'a> FusedIterator for EdgeIter<'a> {}
2039
2040//-----------------------------------------------------------------------------