1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
use crate::{
Alphabet, FmIndex, IndexStorage,
text_with_rank_support::{Block64, CondensedTextWithRankSupport, TextWithRankSupport},
};
use std::marker::PhantomData;
/// A builder-like API to configure and construct the FM-Index.
#[derive(Clone, Copy)]
pub struct FmIndexConfig<I, R = CondensedTextWithRankSupport<I, Block64>> {
pub(crate) suffix_array_sampling_rate: usize,
pub(crate) lookup_table_depth: usize,
pub(crate) performance_priority: PerformancePriority,
_index_storage_marker: PhantomData<I>,
_block_marker: PhantomData<R>,
}
impl<I: IndexStorage, R: TextWithRankSupport<I>> FmIndexConfig<I, R> {
pub fn new() -> Self {
Self::default()
}
/// The FM-Index internally stores a suffix array. Every entry of this array at a position
/// divisible by `suffix_array_sampling_rate` is retained. For example, a rate of 3
/// would retain every third entry of the suffix array.
///
/// A larger rate leads to less memory usage, but higher locate running time. The default is `4`.
pub fn suffix_array_sampling_rate(self, suffix_array_sampling_rate: usize) -> Self {
assert!(suffix_array_sampling_rate > 0);
Self {
suffix_array_sampling_rate,
..self
}
}
/// The FM-Index stores a lookup table to skip the first `lookup_table_depth` many search steps
/// when searching a query. The size of the lookup table grows exponentially in its depth,
/// with the number of searchable alphabet symbols as base. The default is `8`.
///
/// For large texts like genomes and small alphabets like DNA alphabets with 4 searchable symbols,
/// values up to around `13` might be reasonable choices.
pub fn lookup_table_depth(self, lookup_table_depth: usize) -> Self {
Self {
lookup_table_depth,
..self
}
}
/// See [`PerformancePriority`] for details.
pub fn construction_performance_priority(
self,
performance_priority: PerformancePriority,
) -> Self {
Self {
performance_priority,
..self
}
}
/// Construct the FM-Index.
///
/// The number of threads for the build procedure is controlled by [`rayon`].
pub fn construct_index<T: AsRef<[u8]>>(
self,
texts: impl IntoIterator<Item = T>,
alphabet: Alphabet,
) -> FmIndex<I, R> {
FmIndex::new(texts, alphabet, self)
}
}
impl<I: IndexStorage, R: TextWithRankSupport<I>> Default for FmIndexConfig<I, R> {
fn default() -> Self {
Self {
suffix_array_sampling_rate: 4,
lookup_table_depth: 8,
performance_priority: PerformancePriority::HighSpeed,
_index_storage_marker: PhantomData,
_block_marker: PhantomData,
}
}
}
/// This enum can be supplied to the [`FmIndexConfig`] to select different sub-algorithms during the
/// construction of the FM-Index.
///
/// The default is [`HighSpeed`](PerformancePriority::HighSpeed).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PerformancePriority {
HighSpeed,
/// For alphabets with 16 or less symbols in dense encoding, the temporary concatenated text and
/// the BWT buffer can be compressed to use less memory while constructing the BWT. This reduces the peak memory usage
/// of the construction by about 10-15% and only takes a small amount of additional running time.
Balanced,
/// In addition to the space improvements of the `Balanced` variant, a much slower, not parallel suffix
/// array construction algorithm will be used for `u32`-based FM-Indices.
/// This only happens if the `u32-saca` feature is activated (by default it is).
/// This can save a lot of memory when the sum of text lengths fits into a `u32`, but not into a `i32`.
/// The downside is that the construction is much slower (at least 5 times slower should be expected).
LowMemory,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn basic_config() {
let texts = [b"ACGT"];
let alphabet = crate::alphabet::ascii_dna();
let _index = FmIndexConfig::<i32>::new()
.lookup_table_depth(5)
.suffix_array_sampling_rate(8)
.construct_index(texts, alphabet);
}
}