// sshash_lib/builder/dictionary_builder.rs
//! Dictionary builder orchestration
//!
//! Coordinates the multi-step pipeline to build an SSHash dictionary:
//! 1. Parse and encode sequences into SPSS
//! 2. Extract minimizer tuples
//! 3. Classify buckets
//! 4. Build minimizers control map
//! 5. Build sparse and skew index
//! 6. Assemble final dictionary
11use crate::{
12    builder::{
13        buckets::{classify_into_buckets, BucketStatistics},
14        config::BuildConfiguration,
15        encode::Encoder,
16        minimizer_tuples::{compute_minimizer_tuples, compute_minimizer_tuples_external, needs_external_sorting},
17    },
18    dictionary::Dictionary,
19    kmer::{Kmer, KmerBits},
20    minimizers_control_map::{MinimizersControlMapBuilder, BucketType},
21    sparse_and_skew_index::SparseAndSkewIndex,
22    spectrum_preserving_string_set::SpectrumPreservingStringSet,
23};
24use tracing::info;
25
/// Builder for constructing SSHash dictionaries
pub struct DictionaryBuilder {
    // Validated build parameters (k, m, canonical flag, thread count,
    // RAM limit, ...) — checked once in `new` via `config.validate()`.
    config: BuildConfiguration,
}
31impl DictionaryBuilder {
32    /// Create a new dictionary builder with the given configuration
33    pub fn new(config: BuildConfiguration) -> Result<Self, String> {
34        config.validate()?;
35        Ok(Self { config })
36    }
37    
38    /// Build a dictionary from input sequences
39    ///
40    /// # Arguments
41    /// * `sequences` - Vector of DNA sequences (strings)
42    ///
43    /// # Parallelism
44    /// The number of threads is controlled by `config.num_threads`:
45    /// - `0` — use all available CPU cores (rayon default)
46    /// - `1` — single-threaded (no rayon overhead)
47    /// - `N` — use exactly N threads
48    ///
49    /// # Returns
50    /// A fully constructed Dictionary ready for queries
51    pub fn build_from_sequences(&self, sequences: Vec<String>) -> Result<Dictionary, String> {
52        // Build a rayon thread pool sized to config.num_threads.
53        // num_threads == 0 means "all cores" (rayon default).
54        let pool = rayon::ThreadPoolBuilder::new()
55            .num_threads(self.config.num_threads)
56            .build()
57            .map_err(|e| format!("Failed to create thread pool: {e}"))?;
58
59        pool.install(|| self.build_from_sequences_inner(sequences))
60    }
61
62    /// Inner build logic, runs inside the rayon thread pool
63    fn build_from_sequences_inner(&self, sequences: Vec<String>) -> Result<Dictionary, String> {
64        self.config.print();
65        info!("Building SSHash Dictionary");
66        
67        // Step 1: Encode sequences into SPSS
68        info!("Step 1: Encoding sequences...");
69        let (spss, num_sequences) = self.encode_sequences(sequences)?;
70        info!("  Encoded {} sequences", num_sequences);
71        info!("  Total bases: {}", spss.total_bases());
72        
73        // Step 2: Extract minimizer tuples (with inline coalescing during extraction)
74        info!("Step 2: Extracting and coalescing minimizer tuples...");
75        let tuples = self.extract_tuples(&spss)?;
76        info!("  Extracted and coalesced {} tuples", tuples.len());
77        
78        // Step 3: Classify into buckets
79        info!("Step 3: Classifying buckets...");
80        let buckets = classify_into_buckets(tuples);
81        
82        // Compute statistics
83        let mut stats = BucketStatistics::new();
84        for bucket in &buckets {
85            stats.add_bucket(&bucket.tuples);
86        }
87        stats.print_summary();
88        
89        // Step 5: Build minimizers control map
90        info!("Step 5: Building minimizers control map...");
91        let (control_map, bucket_id_by_mphf_index) = self.build_control_map(&buckets)?;
92        info!("  Built MPHF for {} minimizers", control_map.num_minimizers());
93        
94        // Step 6: Build sparse and skew index using indirect MPHF-order iteration.
95        // Instead of physically reordering 364M+ Bucket objects (~30GB for large
96        // datasets), we pass the MPHF→original index mapping and iterate in
97        // MPHF order via indirection.
98        info!("Step 6: Building sparse and skew index...");
99        let mphf_order = if !bucket_id_by_mphf_index.is_empty() {
100            Some(bucket_id_by_mphf_index)
101        } else {
102            None
103        };
104        let index = self.build_index(&buckets, mphf_order.as_deref(), &spss)?;
105        info!("  Index built successfully");
106        
107        // Step 7: Assemble dictionary
108        info!("Dictionary Build Complete");
109        let total_bits = spss.num_bits() + control_map.num_bits() + index.num_bits();
110        info!("Total memory: {:.2} MB", total_bits as f64 / (8.0 * 1024.0 * 1024.0));
111        
112        Ok(Dictionary::new(
113            spss,
114            control_map,
115            index,
116            self.config.k,
117            self.config.m,
118            self.config.canonical,
119        ))
120    }
121    
122    /// Encode sequences into spectrum-preserving string set
123    fn encode_sequences(&self, sequences: Vec<String>) -> Result<(SpectrumPreservingStringSet, usize), String> {
124        let num_sequences = sequences.len();
125        let spss = crate::dispatch_on_k!(self.config.k, K => {
126            self.encode_sequences_k::<K>(sequences)?
127        });
128        
129        Ok((spss, num_sequences))
130    }
131    
132    /// Encode sequences with specific K
133    fn encode_sequences_k<const K: usize>(&self, sequences: Vec<String>) -> Result<SpectrumPreservingStringSet, String>
134    where
135        Kmer<K>: KmerBits,
136    {
137        let mut encoder = Encoder::<K>::new();
138        
139        for (idx, seq) in sequences.iter().enumerate() {
140            encoder.add_sequence(seq.as_bytes()).map_err(|e| {
141                format!("Failed to encode sequence {}: {}", idx, e)
142            })?;
143        }
144        
145        Ok(encoder.build(self.config.m))
146    }
147    
148    /// Extract minimizer tuples from SPSS
149    /// 
150    /// Uses external sorting when the estimated memory exceeds the RAM limit.
151    fn extract_tuples(&self, spss: &SpectrumPreservingStringSet) -> Result<Vec<crate::builder::minimizer_tuples::MinimizerTuple>, String> {
152        // Estimate total k-mers: total_bases - num_strings * (k - 1)
153        let total_bases = spss.total_bases();
154        let num_strings = spss.num_strings();
155        let k = self.config.k as u64;
156        let total_kmers = total_bases.saturating_sub(num_strings * (k - 1));
157        
158        // Check if we need external sorting
159        if needs_external_sorting(total_kmers, self.config.ram_limit_gib) {
160            info!(
161                "Using external sorting: estimated {} k-mers exceeds RAM limit of {} GiB",
162                total_kmers, self.config.ram_limit_gib
163            );
164            crate::dispatch_on_k!(self.config.k, K => {
165                compute_minimizer_tuples_external::<K>(spss, &self.config)
166                    .map_err(|e| e.to_string())
167            })
168        } else {
169            crate::dispatch_on_k!(self.config.k, K => {
170                Ok(compute_minimizer_tuples::<K>(spss, &self.config))
171            })
172        }
173    }
174    
175    /// Build the minimizers control map from buckets
176    ///
177    /// Returns the control map AND a mapping from MPHF index to bucket_id
178    /// for reordering control_codewords to MPHF order.
179    fn build_control_map(&self, buckets: &[crate::builder::buckets::Bucket]) -> Result<(crate::minimizers_control_map::MinimizersControlMap, Vec<usize>), String> {
180        let mut builder = MinimizersControlMapBuilder::new();
181        
182        // Add all minimizers and set their bucket types
183        // The bucket index is implicitly the order in which we add them
184        for (bucket_id, bucket) in buckets.iter().enumerate() {
185            builder.add_minimizer(bucket.minimizer);
186            
187            let bucket_type = match bucket.bucket_type {
188                crate::builder::buckets::BucketType::Singleton => BucketType::Regular,
189                crate::builder::buckets::BucketType::Light => BucketType::Sparse,
190                crate::builder::buckets::BucketType::Heavy => BucketType::HeavyLoad,
191            };
192            
193            builder.set_bucket_type(bucket.minimizer, bucket_type);
194            
195            // Store bucket_id in metadata so we can map MPHF index → bucket_id
196            if let Some(control) = builder.get_control_mut(bucket.minimizer) {
197                control.metadata = bucket_id as u64;
198            }
199        }
200        
201        // Build the MPHF
202        // Use relative_level_size = 100 for minimum MPHF size (~2.8 bits/key)
203        // Note: C++ lambda controls PTHash partition size and is unrelated to fmph's level size
204        let c = 100u16;
205        let alpha = 0.94; // Load factor (not used by ph, but kept for API compatibility)
206        
207        builder.build(c, alpha).map_err(|e| {
208            format!("Failed to build minimizers control map: {}", e)
209        })
210    }
211    
212    /// Build the sparse and skew index
213    fn build_index(
214        &self,
215        buckets: &[crate::builder::buckets::Bucket],
216        mphf_order: Option<&[usize]>,
217        spss: &SpectrumPreservingStringSet,
218    ) -> Result<SparseAndSkewIndex, String> {
219        // Calculate offset encoding bits (matching C++ decoded_offsets)
220        // pos_in_seq stores absolute position in concatenated SPSS
221        // So num_bits_per_offset = ceil_log2(total_bases)
222        let total_bases = spss.total_bases();
223        let num_bits_per_offset = crate::constants::ceil_log2(total_bases);
224
225        // Dispatch to appropriate build function based on k
226        let index = crate::dispatch_on_k!(self.config.k, K => {
227            SparseAndSkewIndex::build::<K>(buckets, mphf_order, num_bits_per_offset, spss, self.config.canonical)
228        });
229
230        Ok(index)
231    }
232}
#[cfg(test)]
mod tests {
    use super::*;

    /// A default configuration must be accepted by the builder.
    #[test]
    fn test_dictionary_builder_creation() {
        assert!(DictionaryBuilder::new(BuildConfiguration::default()).is_ok());
    }

    /// An even k is invalid, so construction must be rejected.
    #[test]
    fn test_dictionary_builder_invalid_config() {
        let bad_config = BuildConfiguration { k: 30, ..BuildConfiguration::default() }; // Even k is invalid
        assert!(DictionaryBuilder::new(bad_config).is_err());
    }

    /// Smoke test: run the full build pipeline on two short sequences.
    #[test]
    fn test_build_simple_dictionary() {
        let builder =
            DictionaryBuilder::new(BuildConfiguration::new(21, 11).unwrap()).unwrap();

        let sequences: Vec<String> = ["ACGTACGTACGTACGTACGTACGT", "TGCATGCATGCATGCATGCATGCA"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        // Note: This test may fail until we have proper k-mer extraction
        // in the build pipeline. For now, just check that it runs.
        let dict = builder.build_from_sequences(sequences);
        println!("Dictionary build result: {:?}", dict.is_ok());
    }
}
267}