gbz_base/path_index.rs
1//! And index for random access to GBZ paths by sequence offsets.
2//!
3//! This index extends the functionality of a [`GBZ`] graph to match [`crate::GraphInterface`].
4
5use gbz::{GBZ, Pos, FullPathName};
6
7use simple_sds::sparse_vector::{SparseVector, SparseBuilder};
8use simple_sds::ops::PredSucc;
9
10use std::collections::HashMap;
11
12#[cfg(test)]
13mod tests;
14
15//-----------------------------------------------------------------------------
16
17// TODO: Should this be in the `gbz` crate?
18/// An index for random access to reference and generic paths in a GBZ graph.
19///
20/// Indexed paths are identified by their offsets in the index.
21/// The offsets range from 0 to `path_count() - 1`.
22///
23/// The combination of [`GBZ`] and [`PathIndex`] is functionally similar to [`crate::GraphInterface`] but tens of times faster.
24/// An in-memory graph is better for batch operations, where the loading time is a negligible fraction of the total time.
25/// The database is better for interactive applications, where the user works with relatively small subgraphs.
26/// For a human graph, the database should be faster than the in-memory graph for subgraphs of up to 1 Mbp.
27///
28/// # Examples
29///
30/// ```
31/// use gbz_base::PathIndex;
32/// use gbz::{GBZ, FullPathName, Pos};
33/// use gbz::support;
34/// use simple_sds::serialize;
35///
36/// let filename = support::get_test_data("example.gbz");
37/// let graph: GBZ = serialize::load_from(&filename).unwrap();
38///
39/// // Create a path index with 3 bp intervals.
40/// let path_index = PathIndex::new(&graph, 3, false);
41/// assert!(path_index.is_ok());
42/// let path_index = path_index.unwrap();
43///
44/// // We have two components with one generic path in each.
45/// assert_eq!(path_index.path_count(), 2);
46/// assert_eq!(path_index.path_length(0), Some(5));
47/// assert_eq!(path_index.path_length(1), Some(4));
48///
49/// // Consider the generic path in component A.
50/// let path_name = FullPathName::generic("A");
51/// let index_offset = path_index.find_path(&graph, &path_name);
52/// assert_eq!(index_offset, Some(0));
53/// let index_offset = index_offset.unwrap();
54///
55/// // There should be two indexed positions for the path.
56/// let first_sample = path_index.indexed_position(index_offset, 2);
57/// assert_eq!(first_sample, Some((0, Pos::new(22, 0))));
58/// let second_sample = path_index.indexed_position(index_offset, 5);
59/// assert_eq!(second_sample, Some((3, Pos::new(30, 0))));
60/// let next_sample = path_index.indexed_position(index_offset, 100);
61/// assert_eq!(next_sample, second_sample);
62/// ```
63#[derive(Clone, Debug, PartialEq, Eq)]
64pub struct PathIndex {
65 // Maps path identifiers to index offsets.
66 path_to_offset: HashMap<usize, usize>,
67
68 // Maps index offsets to path identifiers.
69 offset_to_path: Vec<usize>,
70
71 // Sequence lengths for each path in bp.
72 path_lengths: Vec<usize>,
73
74 // Indexed sequence positions for each path.
75 sequence_positions: Vec<SparseVector>,
76
77 // GBWT positions corresponding to the indexed sequence positions.
78 gbwt_positions: Vec<Vec<Pos>>,
79}
80
81//-----------------------------------------------------------------------------
82
83impl PathIndex {
84 /// Creates a new path index for the given GBZ graph.
85 ///
86 /// The index is built for all reference and generic paths.
87 ///
88 /// # Arguments
89 ///
90 /// * `graph`: A GBZ graph.
91 /// * `interval`: Approximate distance between indexed positions (in bp).
92 /// * `verbose`: Print progress information to stderr.
93 pub fn new(graph: &GBZ, interval: usize, verbose: bool) -> Result<Self, String> {
94 if verbose {
95 eprintln!("Building path index");
96 }
97
98 // NOTE: For fragmented haplotypes, the reference positions are relative
99 // to the start of the fragment.
100 let reference_paths = graph.reference_positions(interval, verbose);
101 if reference_paths.is_empty() {
102 return Err(String::from("No reference paths to index"));
103 }
104
105 let mut path_to_offset: HashMap<usize, usize> = HashMap::with_capacity(reference_paths.len());
106 let mut offset_to_path: Vec<usize> = vec![0; reference_paths.len()];
107 let mut path_lengths: Vec<usize> = Vec::with_capacity(reference_paths.len());
108 let mut sequence_positions = Vec::with_capacity(reference_paths.len());
109 let mut gbwt_positions = Vec::with_capacity(reference_paths.len());
110 for (offset, ref_path) in reference_paths.iter().enumerate() {
111 path_to_offset.insert(ref_path.id, offset);
112 offset_to_path[offset] = ref_path.id;
113 path_lengths.push(ref_path.len);
114 let mut sequence = SparseBuilder::new(ref_path.len, ref_path.positions.len())?;
115 let mut gbwt = Vec::with_capacity(ref_path.positions.len());
116 for (sequence_pos, gbwt_pos) in ref_path.positions.iter() {
117 sequence.set(*sequence_pos);
118 gbwt.push(*gbwt_pos);
119 }
120 sequence_positions.push(SparseVector::try_from(sequence)?);
121 gbwt_positions.push(gbwt);
122 }
123
124 Ok(PathIndex { path_to_offset, offset_to_path, path_lengths, sequence_positions, gbwt_positions })
125 }
126
127 /// Returns the number of indexed paths.
128 #[inline]
129 pub fn path_count(&self) -> usize {
130 self.sequence_positions.len()
131 }
132
133 /// Returns the index offset for the indexed path with the given identifier.
134 ///
135 /// Returns [`None`] if the path does not exist or it has not been indexed.
136 #[inline]
137 pub fn path_to_offset(&self, path_id: usize) -> Option<usize> {
138 self.path_to_offset.get(&path_id).cloned()
139 }
140
141 /// Returns the path identifier for the indexed path with the given index offset.
142 ///
143 /// Returns [`None`] if the path does not exist.
144 #[inline]
145 pub fn offset_to_path(&self, index_offset: usize) -> Option<usize> {
146 self.offset_to_path.get(index_offset).cloned()
147 }
148
149 /// Returns the index offset for the path with the given metadata.
150 ///
151 /// Returns [`None`] if the path does not exist or it has not been indexed.
152 pub fn find_path(&self, graph: &GBZ, path_name: &FullPathName) -> Option<usize> {
153 let metadata = graph.metadata()?;
154 let path_id = metadata.find_path(path_name)?;
155 self.path_to_offset(path_id)
156 }
157
158 /// Returns the length of the indexed path with the given index offset.
159 ///
160 /// Returns [`None`] if the path does not exist.
161 #[inline]
162 pub fn path_length(&self, index_offset: usize) -> Option<usize> {
163 self.path_lengths.get(index_offset).cloned()
164 }
165
166 /// Returns the last indexed position at or before `offset` on the path with name `path_name`.
167 ///
168 /// The return value consists of a sequence offset and a GBWT position.
169 /// Returns [`None`] if the path does not exist or it has not been indexed.
170 /// This is similar to [`crate::GraphInterface::find_path`] followed by [`crate::GraphInterface::indexed_position`].
171 ///
172 /// # Arguments
173 ///
174 /// * `index_offset`: Offset of the path in the index.
175 /// * `query_offset`: Sequence position in the path (in bp).
176 pub fn indexed_position(&self, index_offset: usize, query_offset: usize) -> Option<(usize, Pos)> {
177 let mut iter = self.sequence_positions[index_offset].predecessor(query_offset);
178 if let Some((sample_offset, sequence_offset)) = iter.next() {
179 let gbwt_pos = self.gbwt_positions[index_offset][sample_offset];
180 Some((sequence_offset, gbwt_pos))
181 } else {
182 None
183 }
184 }
185}
186
187//-----------------------------------------------------------------------------