sigalign_impl/pattern_index/
static_lfi.rs1use thiserror::Error;
2
3use crate::utils::get_unique_characters_of_sequence;
4use sigalign_core::reference::PatternIndex;
5use lt_fm_index::{
6 LtFmIndex, Block, blocks,
7};
8
9pub type Lfi32B2V64 = StaticLfi<blocks::Block2<u64>>;
11pub type Lfi32B3V64 = StaticLfi<blocks::Block3<u64>>;
13pub type Lfi32B4V64 = StaticLfi<blocks::Block4<u64>>;
15pub type Lfi32B5V64 = StaticLfi<blocks::Block5<u64>>;
17
18#[derive(Clone)]
22pub struct StaticLfi<B: Block<u32>> {
23 inner: LtFmIndex<u32, B>,
24}
25
26#[derive(Debug, Clone)]
27pub struct LfiOption {
29 pub suffix_array_sampling_ratio: u64,
30 pub lookup_table_max_bytes_size : u64,
31 pub use_safe_guard: bool,
32}
33impl LfiOption {
34 pub fn new(
35 suffix_array_sampling_ratio: u64,
36 lookup_table_max_bytes_size: u64,
37 use_safe_guard: bool,
38 ) -> Self {
39 Self {
40 suffix_array_sampling_ratio,
41 lookup_table_max_bytes_size,
42 use_safe_guard,
43 }
44 }
45}
46
47impl <B: Block<u32>> PatternIndex for StaticLfi<B> {
48 type Option = LfiOption;
49 type BuildError = LfiBuildError;
50
51 fn new(concatenated_sequence : Vec<u8>, option: Self::Option) -> Result<Self, Self::BuildError> {
52 let unique_sequence = get_unique_characters_of_sequence(&concatenated_sequence);
53 let mut valid_characters: Vec<Vec<u8>> = unique_sequence.into_iter().map(|v| vec![v]).collect();
54 if !option.use_safe_guard {
55 valid_characters.pop(); }
57 let characters_by_index: Vec<&[u8]> = valid_characters.iter()
58 .map(|v| v.as_slice())
59 .collect();
60 if characters_by_index.len() as u32 > B::MAX_CHR {
61 let err: LfiBuildError = Self::BuildError::OverMaximumCharacters {
62 max: B::MAX_CHR,
63 input: characters_by_index.len() as u32,
64 };
65 return Err(err);
66 }
67
68 let sequence_length = concatenated_sequence.len();
69 if sequence_length >= u32::MAX as usize {
70 return Err(Self::BuildError::SequenceLengthOver(u32::MAX as u64));
71 }
72 let lookup_table_kmer_size = calculate_lookup_table_kmer_size(
73 characters_by_index.len(),
74 option.lookup_table_max_bytes_size as usize,
75 );
76
77 match LtFmIndex::build(
78 concatenated_sequence,
79 &characters_by_index,
80 option.suffix_array_sampling_ratio as u32,
81 lookup_table_kmer_size,
82 ) {
83 Ok(v) => Ok(Self { inner: v }),
84 Err(err) => Err(Self::BuildError::InvalidOption(format!("{}", err))),
85 }
86 }
87 fn get_sorted_positions(&self, pattern: &[u8]) -> Vec<u32> {
88 let mut positions = self.inner.locate(pattern);
89 positions.sort_unstable();
90 positions
91 }
92}
93
94fn calculate_lookup_table_kmer_size(
95 chr_count: usize,
96 maximum_bytes_size: usize,
97) -> u32 {
98 let max_cap = 50;
99 for v in 1..=max_cap {
100 let estimated_byte_size_of_lt = (chr_count+1).pow(v);
101 if estimated_byte_size_of_lt >= maximum_bytes_size {
102 return v - 1
103 }
104 }
105 max_cap
106}
107
108#[derive(Debug, Error)]
110pub enum LfiBuildError {
111 #[error("Sequence length is over the maximum capacity {0}")]
113 SequenceLengthOver(u64),
114 #[error("Pattern index can make index of {max} characters, input is {input}")]
116 OverMaximumCharacters{
117 max: u32, input: u32, },
120 #[error("Error in option: {0}")]
122 InvalidOption(String), }
124
125use sigalign_core::reference::extensions::{
127 Serialize,
128 EstimateSize,
129};
130impl<B: Block<u32>> Serialize for StaticLfi<B> {
132 fn save_to<W>(&self, mut writer: W) -> Result<(), std::io::Error> where
133 W: std::io::Write
134 {
135 self.inner.save_to(&mut writer)?;
136 Ok(())
137 }
138 fn load_from<R>(mut reader: R) -> Result<Self, std::io::Error> where
139 R: std::io::Read,
140 Self: Sized
141 {
142 let inner = LtFmIndex::load_from(&mut reader)?;
143 Ok(Self { inner })
144 }
145}
146impl<B: Block<u32>> EstimateSize for StaticLfi<B> {
148 fn serialized_size(&self) -> usize {
149 self.inner.encoded_len()
150 }
151}