genedex/
lib.rs

1/*! This library contains an implementation of the FM-Index data structure ([original paper]).
2 *
3 * It is based on an encoding for the text with rank support data structure (a.k.a. occurrence table)
4 * by Simon Gene Gottlieb. This encoding attemps to provide a good trade-off between
5 * memory usage and running time of queries. Another traditional encoding is provided with higher memory usage,
6 * but faster query running times.
7 *
8 * The library supports creating an FM-Index for a set of texts over an [alphabet]. The index construction
9 * is based on the [`libsais-rs`] crate and parallelized.
10 *
11 * ## Usage
12 *
13 * The following is a basic example of how to use this library:
14 *
15 * ```
16 * use genedex::{FmIndexConfig, alphabet};
17 *
18 * let dna_n_alphabet = alphabet::ascii_dna_with_n();
19 * let texts = [b"aACGT", b"acGtn"];
20 *
21 * let index = FmIndexConfig::<i32>::new().construct_index(texts, dna_n_alphabet);
22 *
23 * let query = b"GT";
24 * assert_eq!(index.count(query), 2);
25 *
26 * for hit in index.locate(query) {
27 *     println!(
28 *         "Found query in text {} at position {}.",
29 *         hit.text_id, hit.position
30 *     );
31 * }
32 * ```
33 *
34 * More information about the flexible [cursor](Cursor) API, build [configuration](FmIndexConfig)
35 * and [variants](TextWithRankSupport) of the FM-Index can be found in the module-level and struct-level documentation.
36 *
37 * Optimized functions such as [`FmIndex::locate_many`] exist for searching multiple queries at once. They do not use
38 * multi-threading, but can still be significantly faster (around 2x) than calling the respective functions for single
39 * queries in a loop. The reason for the improved performance is that the queries are searched in batches, which allows
40 * different kinds of parallelism inside the CPU to be used. An example of how such a function is used can be found
41 * [here](https://github.com/feldroop/genedex/blob/master/examples/basic_usage.rs).
42 *
43 * [original paper]: https://doi.org/10.1109/SFCS.2000.892127
44 * [`libsais-rs`]: https://github.com/feldroop/libsais-rs
45 */
46
47/// Contains functions to create various commonly used alphabets.
48pub mod alphabet;
49
50/// Different implementations of the text with rank support (a.k.a. occurrence table) data structure that powers the FM-Index.
51///
52/// The [`TextWithRankSupport`] and [`Block`](text_with_rank_support::Block) traits are good places to start
53///  learning about this module.
54pub mod text_with_rank_support;
55
56mod batch_computed_cursors;
57mod config;
58mod construction;
59mod cursor;
60mod lookup_table;
61mod sampled_suffix_array;
62mod text_id_search_tree;
63
64use num_traits::NumCast;
65
66#[doc(inline)]
67pub use alphabet::Alphabet;
68#[doc(inline)]
69pub use config::FmIndexConfig;
70#[doc(inline)]
71pub use config::PerformancePriority;
72#[doc(inline)]
73pub use construction::IndexStorage;
74#[doc(inline)]
75pub use cursor::Cursor;
76
77use batch_computed_cursors::BatchComputedCursors;
78use construction::DataStructures;
79use lookup_table::LookupTables;
80use sampled_suffix_array::SampledSuffixArray;
81use text_id_search_tree::TexdIdSearchTree;
82use text_with_rank_support::{
83    Block64, Block512, CondensedTextWithRankSupport, FlatTextWithRankSupport, TextWithRankSupport,
84};
85
86/// The FM-Index data structure.
87///
88/// See [crate-level documentation](self) for details.
89#[cfg_attr(feature = "savefile", derive(savefile::savefile_derive::Savefile))]
90#[savefile_doc_hidden]
91#[derive(Clone)]
92pub struct FmIndex<I, R = CondensedTextWithRankSupport<I, Block64>> {
93    alphabet: Alphabet,
94    count: Vec<usize>,
95    text_with_rank_support: R,
96    suffix_array: SampledSuffixArray<I>,
97    text_ids: TexdIdSearchTree,
98    lookup_tables: LookupTables<I>,
99}
100
101/// A little faster than [`FmIndexCondensed512`], and still space efficient for larger alphabets.
102/// This is the default version.
103pub type FmIndexCondensed64<I> = FmIndex<I, CondensedTextWithRankSupport<I, Block64>>;
104
105/// The most space efficient version. Currently, this is experimental and the `*64` usually provide better trade-offs.
106pub type FmIndexCondensed512<I> = FmIndex<I, CondensedTextWithRankSupport<I, Block512>>;
107
108/// The fastest version.
109pub type FmIndexFlat64<I> = FmIndex<I, FlatTextWithRankSupport<I, Block64>>;
110
111/// A little smaller and slower than [`FmIndexFlat64`]. [`FmIndexCondensed64`] should be a better trade-off for most applications.
112pub type FmIndexFlat512<I> = FmIndex<I, FlatTextWithRankSupport<I, Block512>>;
113
114const BATCH_SIZE: usize = 64;
115
116impl<I: IndexStorage, R: TextWithRankSupport<I>> FmIndex<I, R> {
117    fn new<T: AsRef<[u8]>>(
118        texts: impl IntoIterator<Item = T>,
119        alphabet: Alphabet,
120        config: FmIndexConfig<I, R>,
121    ) -> Self {
122        let DataStructures {
123            count,
124            sampled_suffix_array,
125            text_ids,
126            text_with_rank_support,
127        } = construction::create_data_structures::<I, R, T>(texts, &config, &alphabet);
128
129        let num_searchable_dense_symbols = alphabet.num_searchable_dense_symbols();
130
131        let mut index = FmIndex {
132            alphabet,
133            count,
134            text_with_rank_support,
135            suffix_array: sampled_suffix_array,
136            text_ids,
137            lookup_tables: LookupTables::new_empty(),
138        };
139
140        lookup_table::fill_lookup_tables(
141            &mut index,
142            config.lookup_table_depth,
143            num_searchable_dense_symbols,
144        );
145
146        index
147    }
148
149    /// Returns the number of occurrences of `query` in the set of indexed texts.
150    ///
151    /// Running time is in O(`query.len() - d`), where d is the depth of the lookup table of the index.
152    pub fn count(&self, query: &[u8]) -> usize {
153        self.cursor_for_query(query).count()
154    }
155
156    /// The results of [`Self::count`] for multiple queries.
157    ///
158    /// The order of the queries is preserved for the counts. This function can improve the running
159    /// time when many queries are searched.
160    pub fn count_many<'a>(
161        &'a self,
162        queries: impl IntoIterator<Item = &'a [u8]>,
163    ) -> impl Iterator<Item = usize> {
164        self.cursors_for_many_queries(queries)
165            .map(|cursor| cursor.count())
166    }
167
168    /// Returns the occurrences of `query` in the set of indexed texts. The occurrences are not sorted by text id or position.
169    ///
170    /// The initial running time is the same as for [`count`](Self::count).
171    /// For each hit pulled from the iterator, a sampled suffix array lookup is performed.
172    /// This operation needs `s / 2` steps on average, where `s` is the suffix array
173    /// sampling rate of the index.
174    pub fn locate(&self, query: &[u8]) -> impl Iterator<Item = Hit> {
175        let cursor = self.cursor_for_query(query);
176
177        self.locate_interval(cursor.interval())
178    }
179
180    /// The results of [`Self::locate`] for multiple queries.
181    ///
182    /// The order of the queries is preserved for the hits. This function can improve the running
183    /// time when many queries are searched.
184    pub fn locate_many<'a>(
185        &'a self,
186        queries: impl IntoIterator<Item = &'a [u8]>,
187    ) -> impl Iterator<Item: Iterator<Item = Hit>> {
188        self.cursors_for_many_queries(queries)
189            .map(|cursor| self.locate_interval(cursor.interval()))
190    }
191
192    fn locate_interval(&self, interval: HalfOpenInterval) -> impl Iterator<Item = Hit> {
193        self.suffix_array
194            .recover_range(interval.start..interval.end, self)
195            .map(|idx| {
196                let (text_id, position) = self
197                    .text_ids
198                    .backtransfrom_concatenated_text_index(<usize as NumCast>::from(idx).unwrap());
199
200                Hit { text_id, position }
201            })
202    }
203
204    /// Returns a cursor to the index with the empty query currently searched.
205    ///
206    /// See [`Cursor`] for details. Running time is in `O(1)`.
207    pub fn cursor_empty<'a>(&'a self) -> Cursor<'a, I, R> {
208        Cursor {
209            index: self,
210            interval: HalfOpenInterval {
211                start: 0,
212                end: self.total_text_len(),
213            },
214        }
215    }
216
217    /// Returns a cursor to the index with `query` currently searched.
218    ///
219    /// See [`Cursor`] for details. Running time is the same as for [`count`](Self::count).
220    /// This allows using a lookup table jump and therefore can be more efficient than creating
221    /// an empty cursor and repeatedly calling [`Cursor::extend_query_front`].
222    pub fn cursor_for_query<'a>(&'a self, query: &[u8]) -> Cursor<'a, I, R> {
223        let (remaining_query, query_suffix) = self.split_query_for_lookup(query);
224        let interval = self.lookup_tables.lookup(query_suffix, &self.alphabet);
225
226        let mut cursor = Cursor {
227            index: self,
228            interval,
229        };
230
231        for &symbol in remaining_query.iter().rev() {
232            cursor.extend_query_front(symbol);
233
234            if cursor.count() == 0 {
235                break;
236            }
237        }
238
239        cursor
240    }
241
242    /// The results of [`Self::cursor_for_query`] for multiple queries.
243    ///
244    /// The order of the queries is preserved for the cursors. This function can improve the running
245    /// time when many queries are searched.
246    pub fn cursors_for_many_queries<'a>(
247        &'a self,
248        queries: impl IntoIterator<Item = &'a [u8]>,
249    ) -> impl Iterator<Item = Cursor<'a, I, R>> {
250        BatchComputedCursors::<I, R, _, BATCH_SIZE>::new(self, queries.into_iter())
251    }
252
253    fn cursor_for_query_without_alphabet_translation<'a>(
254        &'a self,
255        query: &[u8],
256    ) -> Cursor<'a, I, R> {
257        let (remaining_query, query_suffix) = self.split_query_for_lookup(query);
258        let interval = self
259            .lookup_tables
260            .lookup_without_alphabet_translation(query_suffix);
261
262        let mut cursor = Cursor {
263            index: self,
264            interval,
265        };
266
267        for &symbol in remaining_query.iter().rev() {
268            cursor.extend_front_without_alphabet_translation(symbol);
269
270            if cursor.count() == 0 {
271                break;
272            }
273        }
274
275        cursor
276    }
277
278    fn lf_mapping_step(&self, symbol: u8, idx: usize) -> usize {
279        self.count[symbol as usize] + self.text_with_rank_support.rank(symbol, idx)
280    }
281
282    fn split_query_for_lookup<'a>(&self, query: &'a [u8]) -> (&'a [u8], &'a [u8]) {
283        let lookup_depth = std::cmp::min(query.len(), self.lookup_tables.max_depth());
284        let suffix_idx = query.len() - lookup_depth;
285        query.split_at(suffix_idx)
286    }
287
288    pub fn alphabet(&self) -> &Alphabet {
289        &self.alphabet
290    }
291
292    pub fn num_texts(&self) -> usize {
293        self.text_ids.sentinel_indices.len()
294    }
295
296    /// The length of all the texts that this index is built on. The value includes a sentinel symbol for each text.
297    pub fn total_text_len(&self) -> usize {
298        self.text_with_rank_support.text_len()
299    }
300
301    #[cfg(feature = "savefile")]
302    const VERSION_FOR_SAVEFILE: u32 = 0;
303
304    #[cfg(feature = "savefile")]
305    pub fn load_from_reader(
306        reader: &mut impl std::io::Read,
307    ) -> Result<Self, savefile::SavefileError> {
308        savefile::load(reader, Self::VERSION_FOR_SAVEFILE)
309    }
310
311    #[cfg(feature = "savefile")]
312    pub fn load_from_file(
313        filepath: impl AsRef<std::path::Path>,
314    ) -> Result<Self, savefile::SavefileError> {
315        savefile::load_file(filepath, Self::VERSION_FOR_SAVEFILE)
316    }
317
318    #[cfg(feature = "savefile")]
319    pub fn save_to_writer(
320        &self,
321        writer: &mut impl std::io::Write,
322    ) -> Result<(), savefile::SavefileError> {
323        savefile::save(writer, Self::VERSION_FOR_SAVEFILE, self)
324    }
325
326    #[cfg(feature = "savefile")]
327    pub fn save_to_file(
328        &self,
329        filepath: impl AsRef<std::path::Path>,
330    ) -> Result<(), savefile::SavefileError> {
331        savefile::save_file(filepath, Self::VERSION_FOR_SAVEFILE, self)
332    }
333}
334
335/// Represents an occurrence of a searched query in the set of indexed texts.
336#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
337pub struct Hit {
338    pub text_id: usize,
339    pub position: usize,
340}
341
342#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
343pub(crate) struct HalfOpenInterval {
344    pub start: usize,
345    pub end: usize,
346}
347
348mod maybe_savefile {
349    #[cfg(feature = "savefile")]
350    pub trait MaybeSavefile: savefile::Savefile {}
351
352    #[cfg(not(feature = "savefile"))]
353    pub trait MaybeSavefile {}
354
355    impl MaybeSavefile for i32 {}
356    impl MaybeSavefile for u32 {}
357    impl MaybeSavefile for i64 {}
358}
359
360mod sealed {
361    pub trait Sealed {}
362}