gbz_base/subgraph/query.rs
1//! Queries for extracting a subgraph from GBZ-base or a GBZ graph.
2
3use gbz::{support, FullPathName};
4
5use std::collections::BTreeSet;
6use std::fmt::Display;
7use std::ops::Range;
8
9//-----------------------------------------------------------------------------
10
/// Selects which haplotypes are reported for an extracted subgraph.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum HaplotypeOutput {
    /// Report every haplotype as a separate path.
    All,
    /// Report only distinct haplotypes; the weight field stores the number of duplicates.
    Distinct,
    /// Report only the reference path.
    ReferenceOnly,
}
21
22impl Display for HaplotypeOutput {
23 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
24 match self {
25 HaplotypeOutput::All => write!(f, "all"),
26 HaplotypeOutput::Distinct => write!(f, "distinct"),
27 HaplotypeOutput::ReferenceOnly => write!(f, "reference only"),
28 }
29 }
30}
31
32#[derive(Clone, Debug, PartialEq, Eq)]
33pub(super) enum QueryType {
34 // Path name and offset in bp stored in the fragment field.
35 PathOffset(FullPathName),
36 // Starting position as in `PathOffset` and length in bp.
37 PathInterval(FullPathName, usize),
38 // Set of node identifiers.
39 Nodes(BTreeSet<usize>),
40 // Subgraph between two handles in the same chain, with an optional safety limit for the number of nodes extracted.
41 Between((usize, usize), Option<usize>),
42}
43
44//-----------------------------------------------------------------------------
45
46/// Arguments for extracting a subgraph.
47///
48/// # Examples
49///
50/// ```
51/// use gbz_base::SubgraphQuery;
52/// use gbz::FullPathName;
53///
54/// let path_name = FullPathName::generic("path");
55/// let query = SubgraphQuery::path_offset(&path_name, 123);
56/// assert_eq!(query.context(), SubgraphQuery::DEFAULT_CONTEXT);
57/// assert_eq!(query.snarls(), SubgraphQuery::DEFAULT_SNARLS);
58/// assert_eq!(query.output(), SubgraphQuery::DEFAULT_OUTPUT);
59///
60/// let query = query.with_context(1000);
61/// assert_eq!(query.context(), 1000);
62///
63/// let query = query.with_snarls(true);
64/// assert!(query.snarls());
65/// ```
66#[derive(Clone, Debug, PartialEq, Eq)]
67pub struct SubgraphQuery {
68 query_type: QueryType,
69
70 // Context size around the reference position (in bp).
71 context: usize,
72
73 // Also extract nodes in covered top-level snarls.
74 snarls: bool,
75
76 // How to output the haplotypes.
77 output: HaplotypeOutput,
78}
79
80impl SubgraphQuery {
81 /// Default value for context length (in bp).
82 pub const DEFAULT_CONTEXT: usize = 100;
83
84 /// Default value for the snarl extraction flag.
85 pub const DEFAULT_SNARLS: bool = false;
86
87 /// Default value for the haplotype output option.
88 pub const DEFAULT_OUTPUT: HaplotypeOutput = HaplotypeOutput::All;
89
90 /// Creates a query that retrieves a subgraph around a path offset.
91 ///
92 /// The reference path should be specified by using a sample name, a contig name, and optionally a haplotype number.
93 /// The fragment field should not be used.
94 /// If the reference haplotype is fragmented, the query will try to find the right fragment.
95 pub fn path_offset(path_name: &FullPathName, offset: usize) -> Self {
96 let mut path_name = path_name.clone();
97 path_name.fragment = offset;
98 SubgraphQuery {
99 query_type: QueryType::PathOffset(path_name),
100 context: Self::DEFAULT_CONTEXT,
101 snarls: Self::DEFAULT_SNARLS,
102 output: Self::DEFAULT_OUTPUT,
103 }
104 }
105
106 /// Cretes a query that retrieves a subgraph around a path interval.
107 ///
108 /// The reference path should be specified by using a sample name, a contig name, and optionally a haplotype number.
109 /// The fragment field should not be used.
110 /// If the reference haplotype is fragmented, the query will try to find the right fragment.
111 pub fn path_interval(path_name: &FullPathName, interval: Range<usize>) -> Self {
112 let mut path_name = path_name.clone();
113 path_name.fragment = interval.start;
114 SubgraphQuery {
115 query_type: QueryType::PathInterval(path_name, interval.len()),
116 context: Self::DEFAULT_CONTEXT,
117 snarls: Self::DEFAULT_SNARLS,
118 output: Self::DEFAULT_OUTPUT,
119 }
120 }
121
122 /// Creates a query that retrieves a subgraph around a set of nodes.
123 pub fn nodes(nodes: impl IntoIterator<Item = usize>) -> Self {
124 SubgraphQuery {
125 query_type: QueryType::Nodes(nodes.into_iter().collect()),
126 context: Self::DEFAULT_CONTEXT,
127 snarls: Self::DEFAULT_SNARLS,
128 output: Self::DEFAULT_OUTPUT,
129 }
130 }
131
132 /// Creates a query that extracts a subgraph between two handles in the same chain.
133 ///
134 /// This query ignores context length and the snarl extraction flag.
135 /// An optional safety limit for the size of the subgraph in nodes can be provided.
136 /// If the nodes are not in the same chain in the given order, the subgraph can otherwise be arbitrarily large.
137 pub fn between(start: usize, end: usize, limit: Option<usize>) -> Self {
138 SubgraphQuery {
139 query_type: QueryType::Between((start, end), limit),
140 context: Self::DEFAULT_CONTEXT,
141 snarls: Self::DEFAULT_SNARLS,
142 output: Self::DEFAULT_OUTPUT,
143 }
144 }
145
146 /// Returns an updated query with the given context length.
147 ///
148 /// See [`Self::DEFAULT_CONTEXT`] for the default value.
149 pub fn with_context(self, context: usize) -> Self {
150 SubgraphQuery { context, ..self }
151 }
152
153 /// Returns an updated query with the given snarl extraction flag.
154 ///
155 /// See [`Self::DEFAULT_SNARLS`] for the default value.
156 pub fn with_snarls(self, snarls: bool) -> Self {
157 SubgraphQuery { snarls, ..self }
158 }
159
160 /// Returns an updated query with the given haplotype output option.
161 ///
162 /// See [`Self::DEFAULT_OUTPUT`] for the default value.
163 ///
164 /// # Panics
165 ///
166 /// Panics if this is a node-based query and the output would be [`HaplotypeOutput::ReferenceOnly`].
167 pub fn with_output(self, output: HaplotypeOutput) -> Self {
168 if let QueryType::Nodes(_) = self.query_type {
169 assert!(output != HaplotypeOutput::ReferenceOnly, "Reference-only output is not supported for node-based queries");
170 }
171 SubgraphQuery { output, ..self }
172 }
173
174 pub(super) fn query_type(&self) -> &QueryType {
175 &self.query_type
176 }
177
178 /// Returns the context length (in bp) for the query.
179 pub fn context(&self) -> usize {
180 self.context
181 }
182
183 /// Returns `true` if the query also extracts nodes coverered top-level snarls.
184 ///
185 /// A snarl is covered, if both of its boundary nodes are contained in the query interval or in the context.
186 pub fn snarls(&self) -> bool {
187 self.snarls
188 }
189
190 /// Returns the output format for the query.
191 pub fn output(&self) -> HaplotypeOutput {
192 self.output
193 }
194}
195
196impl Display for SubgraphQuery {
197 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
198 let context_str = if self.snarls() {
199 format!("{} with snarls", self.context)
200 } else {
201 format!("{}", self.context)
202 };
203 match self.query_type() {
204 QueryType::PathOffset(path_name) => write!(f, "(path {}, context {}, {})", path_name, context_str, self.output),
205 QueryType::PathInterval(path_name, len) => write!(f, "(path {}, len {}, context {}, {})", path_name, len, context_str, self.output),
206 QueryType::Nodes(nodes) => write!(f, "(nodes {:#?}, context {}, {})", nodes, context_str, self.output),
207 QueryType::Between((start, end), limit) => {
208 let (start_id, start_o) = support::decode_node(*start);
209 let (end_id, end_o) = support::decode_node(*end);
210 let limit_str = if let Some(limit) = limit {
211 format!(", limit {}", limit)
212 } else {
213 String::new()
214 };
215 write!(f, "(between ({} {}) and ({} {}){}, {})", start_id, start_o, end_id, end_o, limit_str, self.output)
216 },
217 }
218 }
219}
220
221//-----------------------------------------------------------------------------