sigalign_impl/pattern_index/
static_lfi.rs

1use thiserror::Error;
2
3use crate::utils::get_unique_characters_of_sequence;
4use sigalign_core::reference::PatternIndex;
5use lt_fm_index::{
6    LtFmIndex, Block, blocks,
7};
8
9/// `StaticLfi` that can index 3 (2^2 - 1) characters, with a BWT block size of 64.
10pub type Lfi32B2V64 = StaticLfi<blocks::Block2<u64>>;
11/// `StaticLfi` that can index 7 (2^3 - 1) characters, with a BWT block size of 64.
12pub type Lfi32B3V64 = StaticLfi<blocks::Block3<u64>>;
13/// `StaticLfi` that can index 15 (2^4 - 1) characters, with a BWT block size of 64.
14pub type Lfi32B4V64 = StaticLfi<blocks::Block4<u64>>;
15/// `StaticLfi` that can index 31 (2^5 - 1) characters, with a BWT block size of 64.
16pub type Lfi32B5V64 = StaticLfi<blocks::Block5<u64>>;
17
18// TODO: Check if the specification is accurate.
19/// LtFmIndex that has a maximum number of characters that can be indexed.
20/// (The maximum length of one sequence is u32::MAX)
21#[derive(Clone)]
22pub struct StaticLfi<B: Block<u32>> {
23    inner: LtFmIndex<u32, B>,
24}
25
26#[derive(Debug, Clone)]
27/// Option to define the structure of the LtFmIndex.
28pub struct LfiOption {
29    pub suffix_array_sampling_ratio: u64,
30    pub lookup_table_max_bytes_size : u64,
31    pub use_safe_guard: bool,
32}
33impl LfiOption {
34    pub fn new(
35        suffix_array_sampling_ratio: u64,
36        lookup_table_max_bytes_size: u64,
37        use_safe_guard: bool,
38    ) -> Self {
39        Self {
40            suffix_array_sampling_ratio,
41            lookup_table_max_bytes_size,
42            use_safe_guard,
43        }
44    }
45}
46
47impl <B: Block<u32>> PatternIndex for StaticLfi<B> {
48    type Option = LfiOption;
49    type BuildError = LfiBuildError;
50    
51    fn new(concatenated_sequence : Vec<u8>, option: Self::Option) -> Result<Self, Self::BuildError> {
52        let unique_sequence = get_unique_characters_of_sequence(&concatenated_sequence);
53        let mut valid_characters: Vec<Vec<u8>> = unique_sequence.into_iter().map(|v| vec![v]).collect();
54        if !option.use_safe_guard {
55            valid_characters.pop(); // Remove last character
56        }
57        let characters_by_index: Vec<&[u8]> = valid_characters.iter()
58            .map(|v| v.as_slice())
59            .collect();
60        if characters_by_index.len() as u32 > B::MAX_CHR {
61            let err: LfiBuildError = Self::BuildError::OverMaximumCharacters {
62                max: B::MAX_CHR,
63                input: characters_by_index.len() as u32,
64            };
65            return Err(err);
66        }
67
68        let sequence_length = concatenated_sequence.len();
69        if sequence_length >= u32::MAX as usize {
70            return Err(Self::BuildError::SequenceLengthOver(u32::MAX as u64));
71        }
72        let lookup_table_kmer_size = calculate_lookup_table_kmer_size(
73            characters_by_index.len(),
74            option.lookup_table_max_bytes_size as usize,
75        );
76
77        match LtFmIndex::build(
78            concatenated_sequence,
79            &characters_by_index,
80            option.suffix_array_sampling_ratio as u32,
81            lookup_table_kmer_size,
82        ) {
83            Ok(v) => Ok(Self { inner: v }),
84            Err(err) => Err(Self::BuildError::InvalidOption(format!("{}", err))),
85        }
86    }
87    fn get_sorted_positions(&self, pattern: &[u8]) -> Vec<u32> {
88        let mut positions = self.inner.locate(pattern);
89        positions.sort_unstable();
90        positions
91    }
92}
93
94fn calculate_lookup_table_kmer_size(
95    chr_count: usize,
96    maximum_bytes_size: usize,
97) -> u32 {
98    let max_cap = 50;
99    for v in 1..=max_cap {
100        let estimated_byte_size_of_lt = (chr_count+1).pow(v);
101        if estimated_byte_size_of_lt >= maximum_bytes_size {
102            return v - 1
103        }
104    }
105    max_cap
106}
107
108/// Error type for `StaticLfi` build.
109#[derive(Debug, Error)]
110pub enum LfiBuildError {
111    /// Triggered when sequence length exceeds the maximum allowable capacity.
112    #[error("Sequence length is over the maximum capacity {0}")]
113    SequenceLengthOver(u64),
114    /// Triggered when input characters exceed the maximum limit that the `PatternIndex` can index.
115    #[error("Pattern index can make index of {max} characters, input is {input}")]
116    OverMaximumCharacters{
117        max: u32,    // The maximum number of characters that PatternIndex can index
118        input: u32,  // Input characters
119    },
120    /// Triggered when the invalid option is passed.
121    #[error("Error in option: {0}")]
122    InvalidOption(String), // Error message
123}
124
125// Impl Extensions
126use sigalign_core::reference::extensions::{
127    Serialize,
128    EstimateSize,
129};
130//  - Serialize
131impl<B: Block<u32>> Serialize for StaticLfi<B> {
132    fn save_to<W>(&self, mut writer: W) -> Result<(), std::io::Error> where
133        W: std::io::Write
134    {
135        self.inner.save_to(&mut writer)?;
136        Ok(())
137    }
138    fn load_from<R>(mut reader: R) -> Result<Self, std::io::Error> where
139        R: std::io::Read,
140        Self: Sized
141    {
142        let inner = LtFmIndex::load_from(&mut reader)?;
143        Ok(Self { inner })
144    }
145}
146//  - EstimateSize
147impl<B: Block<u32>> EstimateSize for StaticLfi<B> {
148    fn serialized_size(&self) -> usize {
149        self.inner.encoded_len()
150    }
151}