Skip to main content

gbz_base/subgraph/
query.rs

1//! Queries for extracting a subgraph from GBZ-base or a GBZ graph.
2
3use gbz::{support, FullPathName};
4
5use std::collections::BTreeSet;
6use std::fmt::Display;
7use std::ops::Range;
8
9//-----------------------------------------------------------------------------
10
/// Selects which haplotypes are written out for an extracted subgraph.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum HaplotypeOutput {
    /// Every haplotype is written as its own path.
    All,
    /// Only distinct haplotypes are written; the weight field stores the number of duplicates.
    Distinct,
    /// Only the reference path is written.
    ReferenceOnly,
}
21
22impl Display for HaplotypeOutput {
23    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
24        match self {
25            HaplotypeOutput::All => write!(f, "all"),
26            HaplotypeOutput::Distinct => write!(f, "distinct"),
27            HaplotypeOutput::ReferenceOnly => write!(f, "reference only"),
28        }
29    }
30}
31
32#[derive(Clone, Debug, PartialEq, Eq)]
33pub(super) enum QueryType {
34    // Path name and offset in bp stored in the fragment field.
35    PathOffset(FullPathName),
36    // Starting position as in `PathOffset` and length in bp.
37    PathInterval(FullPathName, usize),
38    // Set of node identifiers.
39    Nodes(BTreeSet<usize>),
40    // Subgraph between two handles in the same chain, with an optional safety limit for the number of nodes extracted.
41    Between((usize, usize), Option<usize>),
42}
43
44//-----------------------------------------------------------------------------
45
46/// Arguments for extracting a subgraph.
47///
48/// # Examples
49///
50/// ```
51/// use gbz_base::SubgraphQuery;
52/// use gbz::FullPathName;
53///
54/// let path_name = FullPathName::generic("path");
55/// let query = SubgraphQuery::path_offset(&path_name, 123);
56/// assert_eq!(query.context(), SubgraphQuery::DEFAULT_CONTEXT);
57/// assert_eq!(query.snarls(), SubgraphQuery::DEFAULT_SNARLS);
58/// assert_eq!(query.output(), SubgraphQuery::DEFAULT_OUTPUT);
59///
60/// let query = query.with_context(1000);
61/// assert_eq!(query.context(), 1000);
62///
63/// let query = query.with_snarls(true);
64/// assert!(query.snarls());
65/// ```
66#[derive(Clone, Debug, PartialEq, Eq)]
67pub struct SubgraphQuery {
68    query_type: QueryType,
69
70    // Context size around the reference position (in bp).
71    context: usize,
72
73    // Also extract nodes in covered top-level snarls.
74    snarls: bool,
75
76    // How to output the haplotypes.
77    output: HaplotypeOutput,
78}
79
80impl SubgraphQuery {
81    /// Default value for context length (in bp).
82    pub const DEFAULT_CONTEXT: usize = 100;
83
84    /// Default value for the snarl extraction flag.
85    pub const DEFAULT_SNARLS: bool = false;
86
87    /// Default value for the haplotype output option.
88    pub const DEFAULT_OUTPUT: HaplotypeOutput = HaplotypeOutput::All;
89
90    /// Creates a query that retrieves a subgraph around a path offset.
91    ///
92    /// The reference path should be specified by using a sample name, a contig name, and optionally a haplotype number.
93    /// The fragment field should not be used.
94    /// If the reference haplotype is fragmented, the query will try to find the right fragment.
95    pub fn path_offset(path_name: &FullPathName, offset: usize) -> Self {
96        let mut path_name = path_name.clone();
97        path_name.fragment = offset;
98        SubgraphQuery {
99            query_type: QueryType::PathOffset(path_name),
100            context: Self::DEFAULT_CONTEXT,
101            snarls: Self::DEFAULT_SNARLS,
102            output: Self::DEFAULT_OUTPUT,
103        }
104    }
105
106    /// Cretes a query that retrieves a subgraph around a path interval.
107    ///
108    /// The reference path should be specified by using a sample name, a contig name, and optionally a haplotype number.
109    /// The fragment field should not be used.
110    /// If the reference haplotype is fragmented, the query will try to find the right fragment.
111    pub fn path_interval(path_name: &FullPathName, interval: Range<usize>) -> Self {
112        let mut path_name = path_name.clone();
113        path_name.fragment = interval.start;
114        SubgraphQuery {
115            query_type: QueryType::PathInterval(path_name, interval.len()),
116            context: Self::DEFAULT_CONTEXT,
117            snarls: Self::DEFAULT_SNARLS,
118            output: Self::DEFAULT_OUTPUT,
119        }
120    }
121
122    /// Creates a query that retrieves a subgraph around a set of nodes.
123    pub fn nodes(nodes: impl IntoIterator<Item = usize>) -> Self {
124        SubgraphQuery {
125            query_type: QueryType::Nodes(nodes.into_iter().collect()),
126            context: Self::DEFAULT_CONTEXT,
127            snarls: Self::DEFAULT_SNARLS,
128            output: Self::DEFAULT_OUTPUT,
129        }
130    }
131
132    /// Creates a query that extracts a subgraph between two handles in the same chain.
133    ///
134    /// This query ignores context length and the snarl extraction flag.
135    /// An optional safety limit for the size of the subgraph in nodes can be provided.
136    /// If the nodes are not in the same chain in the given order, the subgraph can otherwise be arbitrarily large.
137    pub fn between(start: usize, end: usize, limit: Option<usize>) -> Self {
138        SubgraphQuery {
139            query_type: QueryType::Between((start, end), limit),
140            context: Self::DEFAULT_CONTEXT,
141            snarls: Self::DEFAULT_SNARLS,
142            output: Self::DEFAULT_OUTPUT,
143        }
144    }
145
146    /// Returns an updated query with the given context length.
147    ///
148    /// See [`Self::DEFAULT_CONTEXT`] for the default value.
149    pub fn with_context(self, context: usize) -> Self {
150        SubgraphQuery { context, ..self }
151    }
152
153    /// Returns an updated query with the given snarl extraction flag.
154    ///
155    /// See [`Self::DEFAULT_SNARLS`] for the default value.
156    pub fn with_snarls(self, snarls: bool) -> Self {
157        SubgraphQuery { snarls, ..self }
158    }
159
160    /// Returns an updated query with the given haplotype output option.
161    ///
162    /// See [`Self::DEFAULT_OUTPUT`] for the default value.
163    ///
164    /// # Panics
165    ///
166    /// Panics if this is a node-based query and the output would be [`HaplotypeOutput::ReferenceOnly`].
167    pub fn with_output(self, output: HaplotypeOutput) -> Self {
168        if let QueryType::Nodes(_) = self.query_type {
169            assert!(output != HaplotypeOutput::ReferenceOnly, "Reference-only output is not supported for node-based queries");
170        }
171        SubgraphQuery { output, ..self }
172    }
173
174    pub(super) fn query_type(&self) -> &QueryType {
175        &self.query_type
176    }
177
178    /// Returns the context length (in bp) for the query.
179    pub fn context(&self) -> usize {
180        self.context
181    }
182
183    /// Returns `true` if the query also extracts nodes coverered top-level snarls.
184    ///
185    /// A snarl is covered, if both of its boundary nodes are contained in the query interval or in the context.
186    pub fn snarls(&self) -> bool {
187        self.snarls
188    }
189
190    /// Returns the output format for the query.
191    pub fn output(&self) -> HaplotypeOutput {
192        self.output
193    }
194}
195
196impl Display for SubgraphQuery {
197    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
198        let context_str = if self.snarls() {
199            format!("{} with snarls", self.context)
200        } else {
201            format!("{}", self.context)
202        };
203        match self.query_type() {
204            QueryType::PathOffset(path_name) => write!(f, "(path {}, context {}, {})", path_name, context_str, self.output),
205            QueryType::PathInterval(path_name, len) => write!(f, "(path {}, len {}, context {}, {})", path_name, len, context_str, self.output),
206            QueryType::Nodes(nodes) => write!(f, "(nodes {:#?}, context {}, {})", nodes, context_str, self.output),
207            QueryType::Between((start, end), limit) => {
208                let (start_id, start_o) = support::decode_node(*start);
209                let (end_id, end_o) = support::decode_node(*end);
210                let limit_str = if let Some(limit) = limit {
211                    format!(", limit {}", limit)
212                } else {
213                    String::new()
214                };
215                write!(f, "(between ({} {}) and ({} {}){}, {})", start_id, start_o, end_id, end_o, limit_str, self.output)
216            },
217        }
218    }
219}
220
221//-----------------------------------------------------------------------------