// sshash_lib/builder/dictionary_builder.rs

use crate::{
    builder::{
        buckets::{classify_into_buckets, BucketStatistics},
        config::BuildConfiguration,
        encode::Encoder,
        minimizer_tuples::{
            compute_minimizer_tuples, compute_minimizer_tuples_external, needs_external_sorting,
        },
    },
    dictionary::Dictionary,
    kmer::{Kmer, KmerBits},
    minimizers_control_map::{BucketType, MinimizersControlMapBuilder},
    sparse_and_skew_index::SparseAndSkewIndex,
    spectrum_preserving_string_set::SpectrumPreservingStringSet,
};
use tracing::info;

/// Builds an SSHash [`Dictionary`] from raw input sequences according to a
/// validated [`BuildConfiguration`].
pub struct DictionaryBuilder {
    // Build parameters (k, m, num_threads, ram_limit_gib, canonical, ...);
    // validated in `DictionaryBuilder::new` before any build runs.
    config: BuildConfiguration,
}
30
31impl DictionaryBuilder {
32 pub fn new(config: BuildConfiguration) -> Result<Self, String> {
34 config.validate()?;
35 Ok(Self { config })
36 }
37
38 pub fn build_from_sequences(&self, sequences: Vec<String>) -> Result<Dictionary, String> {
52 let pool = rayon::ThreadPoolBuilder::new()
55 .num_threads(self.config.num_threads)
56 .build()
57 .map_err(|e| format!("Failed to create thread pool: {e}"))?;
58
59 pool.install(|| self.build_from_sequences_inner(sequences))
60 }
61
62 fn build_from_sequences_inner(&self, sequences: Vec<String>) -> Result<Dictionary, String> {
64 self.config.print();
65 info!("Building SSHash Dictionary");
66
67 info!("Step 1: Encoding sequences...");
69 let (spss, num_sequences) = self.encode_sequences(sequences)?;
70 info!(" Encoded {} sequences", num_sequences);
71 info!(" Total bases: {}", spss.total_bases());
72
73 info!("Step 2: Extracting and coalescing minimizer tuples...");
75 let tuples = self.extract_tuples(&spss)?;
76 info!(" Extracted and coalesced {} tuples", tuples.len());
77
78 info!("Step 3: Classifying buckets...");
80 let buckets = classify_into_buckets(tuples);
81
82 let mut stats = BucketStatistics::new();
84 for bucket in &buckets {
85 stats.add_bucket(&bucket.tuples);
86 }
87 stats.print_summary();
88
89 info!("Step 5: Building minimizers control map...");
91 let (control_map, bucket_id_by_mphf_index) = self.build_control_map(&buckets)?;
92 info!(" Built MPHF for {} minimizers", control_map.num_minimizers());
93
94 info!("Step 6: Building sparse and skew index...");
99 let mphf_order = if !bucket_id_by_mphf_index.is_empty() {
100 Some(bucket_id_by_mphf_index)
101 } else {
102 None
103 };
104 let index = self.build_index(&buckets, mphf_order.as_deref(), &spss)?;
105 info!(" Index built successfully");
106
107 info!("Dictionary Build Complete");
109 let total_bits = spss.num_bits() + control_map.num_bits() + index.num_bits();
110 info!("Total memory: {:.2} MB", total_bits as f64 / (8.0 * 1024.0 * 1024.0));
111
112 Ok(Dictionary::new(
113 spss,
114 control_map,
115 index,
116 self.config.k,
117 self.config.m,
118 self.config.canonical,
119 ))
120 }
121
122 fn encode_sequences(&self, sequences: Vec<String>) -> Result<(SpectrumPreservingStringSet, usize), String> {
124 let num_sequences = sequences.len();
125 let spss = crate::dispatch_on_k!(self.config.k, K => {
126 self.encode_sequences_k::<K>(sequences)?
127 });
128
129 Ok((spss, num_sequences))
130 }
131
132 fn encode_sequences_k<const K: usize>(&self, sequences: Vec<String>) -> Result<SpectrumPreservingStringSet, String>
134 where
135 Kmer<K>: KmerBits,
136 {
137 let mut encoder = Encoder::<K>::new();
138
139 for (idx, seq) in sequences.iter().enumerate() {
140 encoder.add_sequence(seq.as_bytes()).map_err(|e| {
141 format!("Failed to encode sequence {}: {}", idx, e)
142 })?;
143 }
144
145 Ok(encoder.build(self.config.m))
146 }
147
148 fn extract_tuples(&self, spss: &SpectrumPreservingStringSet) -> Result<Vec<crate::builder::minimizer_tuples::MinimizerTuple>, String> {
152 let total_bases = spss.total_bases();
154 let num_strings = spss.num_strings();
155 let k = self.config.k as u64;
156 let total_kmers = total_bases.saturating_sub(num_strings * (k - 1));
157
158 if needs_external_sorting(total_kmers, self.config.ram_limit_gib) {
160 info!(
161 "Using external sorting: estimated {} k-mers exceeds RAM limit of {} GiB",
162 total_kmers, self.config.ram_limit_gib
163 );
164 crate::dispatch_on_k!(self.config.k, K => {
165 compute_minimizer_tuples_external::<K>(spss, &self.config)
166 .map_err(|e| e.to_string())
167 })
168 } else {
169 crate::dispatch_on_k!(self.config.k, K => {
170 Ok(compute_minimizer_tuples::<K>(spss, &self.config))
171 })
172 }
173 }
174
175 fn build_control_map(&self, buckets: &[crate::builder::buckets::Bucket]) -> Result<(crate::minimizers_control_map::MinimizersControlMap, Vec<usize>), String> {
180 let mut builder = MinimizersControlMapBuilder::new();
181
182 for (bucket_id, bucket) in buckets.iter().enumerate() {
185 builder.add_minimizer(bucket.minimizer);
186
187 let bucket_type = match bucket.bucket_type {
188 crate::builder::buckets::BucketType::Singleton => BucketType::Regular,
189 crate::builder::buckets::BucketType::Light => BucketType::Sparse,
190 crate::builder::buckets::BucketType::Heavy => BucketType::HeavyLoad,
191 };
192
193 builder.set_bucket_type(bucket.minimizer, bucket_type);
194
195 if let Some(control) = builder.get_control_mut(bucket.minimizer) {
197 control.metadata = bucket_id as u64;
198 }
199 }
200
201 let c = 100u16;
205 let alpha = 0.94; builder.build(c, alpha).map_err(|e| {
208 format!("Failed to build minimizers control map: {}", e)
209 })
210 }
211
212 fn build_index(
214 &self,
215 buckets: &[crate::builder::buckets::Bucket],
216 mphf_order: Option<&[usize]>,
217 spss: &SpectrumPreservingStringSet,
218 ) -> Result<SparseAndSkewIndex, String> {
219 let total_bases = spss.total_bases();
223 let num_bits_per_offset = crate::constants::ceil_log2(total_bases);
224
225 let index = crate::dispatch_on_k!(self.config.k, K => {
227 SparseAndSkewIndex::build::<K>(buckets, mphf_order, num_bits_per_offset, spss, self.config.canonical)
228 });
229
230 Ok(index)
231 }
232}
233
#[cfg(test)]
mod tests {
    use super::*;

    // Constructing a builder from the default configuration must succeed.
    #[test]
    fn test_dictionary_builder_creation() {
        let config = BuildConfiguration::default();
        assert!(DictionaryBuilder::new(config).is_ok());
    }

    // A configuration with k = 30 is expected to fail validation, so `new`
    // must reject it.
    #[test]
    fn test_dictionary_builder_invalid_config() {
        let config = BuildConfiguration {
            k: 30,
            ..BuildConfiguration::default()
        };
        assert!(DictionaryBuilder::new(config).is_err());
    }

    // End-to-end smoke test: drive the full pipeline on a tiny input.
    #[test]
    fn test_build_simple_dictionary() {
        let builder =
            DictionaryBuilder::new(BuildConfiguration::new(21, 11).unwrap()).unwrap();

        let sequences: Vec<String> = ["ACGTACGTACGTACGTACGTACGT", "TGCATGCATGCATGCATGCATGCA"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        let dict = builder.build_from_sequences(sequences);
        // NOTE(review): this only reports the outcome; consider asserting
        // `dict.is_ok()` once the build is known-good for this input.
        println!("Dictionary build result: {:?}", dict.is_ok());
    }
}