sview_fmindex/builder/
mod.rs

1use std::marker::PhantomData;
2
3use crate::{
4    // traits
5    Position, Block,
6    components::{
7        Header, View,
8        // headers
9        MagicNumber, TextEncoder, CountArrayHeader, SuffixArrayHeader, BwmHeader,
10        // views
11        CountArrayView, SuffixArrayView, BwmView,
12    },
13};
14
15pub mod build_config;
16
17/// Builder for FM-index
18pub struct FmIndexBuilder<P: Position, B: Block, E: TextEncoder> {
19    // Unchangeable after init
20    text_len: usize,
21    symbol_count: u32,
22    magic_number: MagicNumber,
23    text_encoder: E,
24    // Configs
25    suffix_array_config: build_config::SuffixArrayConfig,
26    lookup_table_config: build_config::LookupTableConfig,
27    // Changeable after init
28    count_array_header: CountArrayHeader,
29    suffix_array_header: SuffixArrayHeader,
30    bwm_header: BwmHeader,
31    // Phantom data
32    _phantom: PhantomData<(P, B)>,
33}
34
35/// Error type for the builder
36#[derive(Debug, thiserror::Error)]
37pub enum BuildError {
38    /// The number of distinct symbols exceeds the capacity of the chosen block type.
39    #[error("The symbol count ({1}) exceeds the maximum for the chosen block type ({0}). Try using a larger block type or reducing the number of symbols.")]
40    SymbolCountOver(u32, u32),
41
42    /// The length of the provided text does not match the length declared during builder initialization.
43    #[error("Mismatched text length: expected {0} bytes, but got {1} bytes.")]
44    UnmatchedTextLength(usize, usize),
45
46    /// The provided blob slice has an incorrect size.
47    #[error("Incorrect blob size: expected {0} bytes, but got {1} bytes.")]
48    InvalidBlobSize(usize, usize),
49    
50    /// The provided blob slice is not properly aligned.
51    #[error("Improper blob alignment: required alignment is {0} bytes, but the blob has an offset of {1} bytes.")]
52    NotAlignedBlob(usize, usize),
53
54    /// An invalid build configuration was provided.
55    #[error("Invalid build configuration: {0}")]
56    InvalidConfig(String),
57}
58
59impl<P: Position, B: Block, E: TextEncoder> FmIndexBuilder<P, B, E> {
60    // ================================================
61    // Set up builder
62    // ================================================
63    pub fn new(
64        text_len: usize,
65        symbol_count: u32,
66        text_encoder: E,
67    ) -> Result<Self, BuildError> {
68        let suffix_array_config = build_config::SuffixArrayConfig::default();
69        let lookup_table_config = build_config::LookupTableConfig::default();
70
71        if symbol_count > B::MAX_SYMBOL {
72            return Err(BuildError::SymbolCountOver(B::MAX_SYMBOL, symbol_count));
73        }
74
75        // Generate headers
76        let (count_array_header, suffix_array_header, bwm_header) = Self::generate_headers(
77            text_len,
78            symbol_count,
79            &suffix_array_config,
80            &lookup_table_config,
81        )?;
82
83        Ok(Self {
84            // Unchangeable after init
85            text_len,
86            symbol_count,
87            magic_number: MagicNumber::new(),
88            text_encoder,
89            // Configs
90            lookup_table_config,
91            suffix_array_config,
92            // Changeable after init
93            count_array_header,
94            suffix_array_header,
95            bwm_header,
96            // Phantom data
97            _phantom: PhantomData,
98        })
99    }
100    fn generate_headers(
101        text_len: usize,
102        symbol_count: u32,
103        suffix_array_config: &build_config::SuffixArrayConfig,
104        lookup_table_config: &build_config::LookupTableConfig,
105    ) -> Result<(CountArrayHeader, SuffixArrayHeader, BwmHeader), BuildError> {
106        let lookup_table_kmer_size = lookup_table_config.kmer_size::<P>(symbol_count)?;
107        let suffix_array_sampling_ratio = suffix_array_config.sampling_ratio()?;
108
109        let count_array_header = CountArrayHeader::new(
110            symbol_count,
111            lookup_table_kmer_size,
112        );
113        let suffix_array_header = SuffixArrayHeader::new(
114            text_len as u64,
115            suffix_array_sampling_ratio,
116        );
117        let bwm_header = BwmHeader::new::<P, B>(
118            text_len as u64,
119            symbol_count,
120        );
121
122        Ok((
123            count_array_header,
124            suffix_array_header,
125            bwm_header,
126        ))
127    }
128    pub fn set_lookup_table_config(self, config: build_config::LookupTableConfig) -> Result<Self, BuildError> {
129        let (count_array_header, suffix_array_header, bwm_header) = Self::generate_headers(
130            self.text_len,
131            self.symbol_count,
132            &self.suffix_array_config,
133            &config,
134        )?;
135
136        Ok(Self {
137            lookup_table_config: config,
138            count_array_header,
139            suffix_array_header,
140            bwm_header,
141            ..self
142        })
143    }
144    pub fn set_suffix_array_config(self, config: build_config::SuffixArrayConfig) -> Result<Self, BuildError> {
145        let (count_array_header, suffix_array_header, bwm_header) = Self::generate_headers(
146            self.text_len,
147            self.symbol_count,
148            &config,
149            &self.lookup_table_config,
150        )?;
151
152        Ok(Self {
153            suffix_array_config: config,
154            count_array_header,
155            suffix_array_header,
156            bwm_header,
157            ..self
158        })
159    }
160
161    // ================================================
162    // Blob size calculation
163    // ================================================
164    /// Calculate the total size of the blob in bytes
165    pub fn blob_size(&self) -> usize {
166        self.header_size() + self.body_size()
167    }
168    // Header size in bytes
169    fn header_size(&self) -> usize {
170        self.magic_number.aligned_size::<B>()
171        + self.text_encoder.aligned_size::<B>()
172        + self.count_array_header.aligned_size::<B>()
173        + self.suffix_array_header.aligned_size::<B>()
174        + self.bwm_header.aligned_size::<B>()
175    }
176    // Body size in bytes
177    fn body_size(&self) -> usize {
178        CountArrayView::<P>::aligned_body_size::<B>(&self.count_array_header)
179        + SuffixArrayView::<P>::aligned_body_size::<B>(&self.suffix_array_header) 
180        + BwmView::<P, B>::aligned_body_size::<B>(&self.bwm_header)
181    }
182
183    // ================================================
184    // Build
185    // ================================================
186    /// Build the FM-index and write to the provided blob slice
187    pub fn build<'a>(
188        &self,
189        mut text: Vec<u8>,
190        blob: &'a mut [u8],
191    ) -> Result<(), BuildError> {
192        // Check text length
193        if text.len() != self.text_len {
194            return Err(BuildError::UnmatchedTextLength(self.text_len, text.len()));
195        }
196
197        // Check alignment
198        let required_alignment = B::ALIGN_SIZE;
199        let offset = blob.as_ptr() as usize % required_alignment;
200        if offset != 0 {
201            return Err(BuildError::NotAlignedBlob(required_alignment, offset));
202        }
203
204        // Check blob size
205        let blob_size = self.blob_size();
206        let blob_size_actual = blob.len();
207        if blob_size != blob_size_actual {
208            return Err(BuildError::InvalidBlobSize(blob_size, blob_size_actual));
209        }
210
211        // 1) Write headers
212        let mut header_start_index = 0;
213        // Magic number
214        let mut header_end_index = self.magic_number.aligned_size::<B>();
215        self.magic_number.write_to_blob(&mut blob[header_start_index..header_end_index]);
216        // Encoding table
217        header_start_index = header_end_index;
218        header_end_index += self.text_encoder.aligned_size::<B>();
219        self.text_encoder.write_to_blob(&mut blob[header_start_index..header_end_index]);
220        // Count array header
221        header_start_index = header_end_index;
222        header_end_index += self.count_array_header.aligned_size::<B>();
223        self.count_array_header.write_to_blob(&mut blob[header_start_index..header_end_index]);
224        // Suffix array header
225        header_start_index = header_end_index;
226        header_end_index += self.suffix_array_header.aligned_size::<B>();
227        self.suffix_array_header.write_to_blob(&mut blob[header_start_index..header_end_index]);
228        // BWM header
229        header_start_index = header_end_index;
230        header_end_index += self.bwm_header.aligned_size::<B>();
231        self.bwm_header.write_to_blob(&mut blob[header_start_index..header_end_index]);
232
233        // 2) Build & write bodies
234        let mut body_start_index = header_end_index;
235        let mut body_end_index = body_start_index + CountArrayView::<P>::aligned_body_size::<B>(&self.count_array_header);
236        // Count array
237        //  - encode text with encoding table
238        //  - during encoding, count the number of each character & kmer
239        self.count_array_header.count_and_encode_text::<P, B, E>(
240            &mut text,
241            &self.text_encoder,
242            &mut blob[body_start_index..body_end_index],
243        );
244        // Suffix array
245        //  - burrow-wheeler transform
246        //  - get sentinel character index
247        body_start_index = body_end_index;
248        body_end_index = body_start_index + SuffixArrayView::<P>::aligned_body_size::<B>(&self.suffix_array_header);
249
250        let sentinel_index = self.suffix_array_header.write_to_blob_and_get_sentinel_index::<P>(
251            &mut text,
252            &mut blob[body_start_index..body_end_index],
253        );
254        // BWM
255        body_start_index = body_end_index;
256        body_end_index = body_start_index + BwmView::<P, B>::aligned_body_size::<B>(&self.bwm_header);
257        self.bwm_header.encode_bwm_body::<P, B>(
258            text,
259            sentinel_index, 
260            &mut blob[body_start_index..body_end_index],
261        );
262
263        Ok(())
264    }
265}