Skip to main content

gbz_base/
path_index.rs

1//! And index for random access to GBZ paths by sequence offsets.
2//!
3//! This index extends the functionality of a [`GBZ`] graph to match [`crate::GraphInterface`].
4
5use gbz::{GBZ, Pos, FullPathName};
6
7use simple_sds::sparse_vector::{SparseVector, SparseBuilder};
8use simple_sds::ops::PredSucc;
9
10use std::collections::HashMap;
11
12#[cfg(test)]
13mod tests;
14
15//-----------------------------------------------------------------------------
16
17// TODO: Should this be in the `gbz` crate?
18/// An index for random access to reference and generic paths in a GBZ graph.
19///
20/// Indexed paths are identified by their offsets in the index.
21/// The offsets range from 0 to `path_count() - 1`.
22///
23/// The combination of [`GBZ`] and [`PathIndex`] is functionally similar to [`crate::GraphInterface`] but tens of times faster.
24/// An in-memory graph is better for batch operations, where the loading time is a negligible fraction of the total time.
25/// The database is better for interactive applications, where the user works with relatively small subgraphs.
26/// For a human graph, the database should be faster than the in-memory graph for subgraphs of up to 1 Mbp.
27///
28/// # Examples
29///
30/// ```
31/// use gbz_base::PathIndex;
32/// use gbz::{GBZ, FullPathName, Pos};
33/// use gbz::support;
34/// use simple_sds::serialize;
35///
36/// let filename = support::get_test_data("example.gbz");
37/// let graph: GBZ = serialize::load_from(&filename).unwrap();
38///
39/// // Create a path index with 3 bp intervals.
40/// let path_index = PathIndex::new(&graph, 3, false);
41/// assert!(path_index.is_ok());
42/// let path_index = path_index.unwrap();
43///
44/// // We have two components with one generic path in each.
45/// assert_eq!(path_index.path_count(), 2);
46/// assert_eq!(path_index.path_length(0), Some(5));
47/// assert_eq!(path_index.path_length(1), Some(4));
48///
49/// // Consider the generic path in component A.
50/// let path_name = FullPathName::generic("A");
51/// let index_offset = path_index.find_path(&graph, &path_name);
52/// assert_eq!(index_offset, Some(0));
53/// let index_offset = index_offset.unwrap();
54///
55/// // There should be two indexed positions for the path.
56/// let first_sample = path_index.indexed_position(index_offset, 2);
57/// assert_eq!(first_sample, Some((0, Pos::new(22, 0))));
58/// let second_sample = path_index.indexed_position(index_offset, 5);
59/// assert_eq!(second_sample, Some((3, Pos::new(30, 0))));
60/// let next_sample = path_index.indexed_position(index_offset, 100);
61/// assert_eq!(next_sample, second_sample);
62/// ```
63#[derive(Clone, Debug, PartialEq, Eq)]
64pub struct PathIndex {
65    // Maps path identifiers to index offsets.
66    path_to_offset: HashMap<usize, usize>,
67
68    // Maps index offsets to path identifiers.
69    offset_to_path: Vec<usize>,
70
71    // Sequence lengths for each path in bp.
72    path_lengths: Vec<usize>,
73
74    // Indexed sequence positions for each path.
75    sequence_positions: Vec<SparseVector>,
76
77    // GBWT positions corresponding to the indexed sequence positions.
78    gbwt_positions: Vec<Vec<Pos>>,
79}
80
81//-----------------------------------------------------------------------------
82
83impl PathIndex {
84    /// Creates a new path index for the given GBZ graph.
85    ///
86    /// The index is built for all reference and generic paths.
87    ///
88    /// # Arguments
89    ///
90    /// * `graph`: A GBZ graph.
91    /// * `interval`: Approximate distance between indexed positions (in bp).
92    /// * `verbose`: Print progress information to stderr.
93    pub fn new(graph: &GBZ, interval: usize, verbose: bool) -> Result<Self, String> {
94        if verbose {
95            eprintln!("Building path index");
96        }
97
98        // NOTE: For fragmented haplotypes, the reference positions are relative
99        // to the start of the fragment.
100        let reference_paths = graph.reference_positions(interval, verbose);
101        if reference_paths.is_empty() {
102            return Err(String::from("No reference paths to index"));
103        }
104
105        let mut path_to_offset: HashMap<usize, usize> = HashMap::with_capacity(reference_paths.len());
106        let mut offset_to_path: Vec<usize> = vec![0; reference_paths.len()];
107        let mut path_lengths: Vec<usize> = Vec::with_capacity(reference_paths.len());
108        let mut sequence_positions = Vec::with_capacity(reference_paths.len());
109        let mut gbwt_positions = Vec::with_capacity(reference_paths.len());
110        for (offset, ref_path) in reference_paths.iter().enumerate() {
111            path_to_offset.insert(ref_path.id, offset);
112            offset_to_path[offset] = ref_path.id;
113            path_lengths.push(ref_path.len);
114            let mut sequence = SparseBuilder::new(ref_path.len, ref_path.positions.len())?;
115            let mut gbwt = Vec::with_capacity(ref_path.positions.len());
116            for (sequence_pos, gbwt_pos) in ref_path.positions.iter() {
117                sequence.set(*sequence_pos);
118                gbwt.push(*gbwt_pos);
119            }
120            sequence_positions.push(SparseVector::try_from(sequence)?);
121            gbwt_positions.push(gbwt);
122        }
123
124        Ok(PathIndex { path_to_offset, offset_to_path, path_lengths, sequence_positions, gbwt_positions })
125    }
126
127    /// Returns the number of indexed paths.
128    #[inline]
129    pub fn path_count(&self) -> usize {
130        self.sequence_positions.len()
131    }
132
133    /// Returns the index offset for the indexed path with the given identifier.
134    ///
135    /// Returns [`None`] if the path does not exist or it has not been indexed.
136    #[inline]
137    pub fn path_to_offset(&self, path_id: usize) -> Option<usize> {
138        self.path_to_offset.get(&path_id).cloned()
139    }
140
141    /// Returns the path identifier for the indexed path with the given index offset.
142    ///
143    /// Returns [`None`] if the path does not exist.
144    #[inline]
145    pub fn offset_to_path(&self, index_offset: usize) -> Option<usize> {
146        self.offset_to_path.get(index_offset).cloned()
147    }
148
149    /// Returns the index offset for the path with the given metadata.
150    ///
151    /// Returns [`None`] if the path does not exist or it has not been indexed.
152    pub fn find_path(&self, graph: &GBZ, path_name: &FullPathName) -> Option<usize> {
153        let metadata = graph.metadata()?;
154        let path_id = metadata.find_path(path_name)?;
155        self.path_to_offset(path_id)
156    }
157
158    /// Returns the length of the indexed path with the given index offset.
159    ///
160    /// Returns [`None`] if the path does not exist.
161    #[inline]
162    pub fn path_length(&self, index_offset: usize) -> Option<usize> {
163        self.path_lengths.get(index_offset).cloned()
164    }
165
166    /// Returns the last indexed position at or before `offset` on the path with name `path_name`.
167    ///
168    /// The return value consists of a sequence offset and a GBWT position.
169    /// Returns [`None`] if the path does not exist or it has not been indexed.
170    /// This is similar to [`crate::GraphInterface::find_path`] followed by [`crate::GraphInterface::indexed_position`].
171    ///
172    /// # Arguments
173    ///
174    /// * `index_offset`: Offset of the path in the index.
175    /// * `query_offset`: Sequence position in the path (in bp).
176    pub fn indexed_position(&self, index_offset: usize, query_offset: usize) -> Option<(usize, Pos)> {
177        let mut iter = self.sequence_positions[index_offset].predecessor(query_offset);
178        if let Some((sample_offset, sequence_offset)) = iter.next() {
179            let gbwt_pos = self.gbwt_positions[index_offset][sample_offset];
180            Some((sequence_offset, gbwt_pos))
181        } else {
182            None
183        }
184    }
185}
186
187//-----------------------------------------------------------------------------