Skip to main content

sshash_lib/builder/
config.rs

1//! Build configuration for SSHash dictionary construction
2//!
3//! Mirrors the C++ `build_configuration` struct with parameters for
4//! dictionary building, minimizer computation, and resource limits.
5
6use crate::constants::DEFAULT_SEED;
7use std::path::PathBuf;
8
/// Configuration parameters for building an SSHash dictionary.
///
/// All fields are public, so a value can be assembled with struct-update
/// syntax on top of [`Default`]. Nothing is checked at that point; call
/// `validate` before using a hand-built configuration.
#[derive(Debug, Clone, PartialEq)]
pub struct BuildConfiguration {
    /// K-mer length (must be odd, between 3 and 63)
    pub k: usize,

    /// Minimizer length (must be strictly less than `k`; even values are accepted)
    pub m: usize,

    /// Seed for hash functions
    pub seed: u64,

    /// Number of threads for parallel operations (0 = all available cores)
    pub num_threads: usize,

    /// RAM limit in GiB for external sorting
    pub ram_limit_gib: usize,

    /// PTHash lambda parameter (trade-off for MPHF construction)
    /// Typically 3.5-4.0 for minimal size, higher for faster queries
    pub lambda: f64,

    /// Build in canonical mode (k-mer and reverse-complement map to same entry)
    pub canonical: bool,

    /// Use partitioned MPHF for parallel construction (default: true)
    pub partitioned_mphf: bool,

    /// Build weighted dictionary (with k-mer abundance/weights)
    pub weighted: bool,

    /// Verbose output during construction
    pub verbose: bool,

    /// Directory for temporary files during construction
    pub tmp_dirname: PathBuf,
}
46
47impl Default for BuildConfiguration {
48    fn default() -> Self {
49        Self {
50            k: 31,
51            m: 19,  // Must be odd, less than k
52            seed: DEFAULT_SEED,
53            num_threads: 0, // 0 = use all available cores
54            ram_limit_gib: 8,
55            lambda: 6.0,  // C++ default
56            canonical: false,
57            partitioned_mphf: true,
58            weighted: false,
59            verbose: true,
60            tmp_dirname: PathBuf::from("sshash_tmp"),
61        }
62    }
63}
64
65impl BuildConfiguration {
66    /// Create a new build configuration with the specified k-mer and minimizer lengths
67    pub fn new(k: usize, m: usize) -> Result<Self, String> {
68        let config = Self {
69            k,
70            m,
71            ..Self::default()
72        };
73        config.validate()?;
74        Ok(config)
75    }
76    
77    /// Validate the configuration parameters
78    pub fn validate(&self) -> Result<(), String> {
79        // Check k is odd and in valid range
80        if self.k % 2 == 0 {
81            return Err(format!("k must be odd, got k={}", self.k));
82        }
83        if self.k < 3 || self.k > 63 {
84            return Err(format!("k must be in range [3, 63], got k={}", self.k));
85        }
86        
87        // Check m is less than k
88        if self.m >= self.k {
89            return Err(format!("m must be less than k, got m={}, k={}", self.m, self.k));
90        }
91        
92        // Check lambda is reasonable
93        if self.lambda < 1.0 || self.lambda > 100.0 {
94            return Err(format!("lambda should be in range [1.0, 100.0], got {}", self.lambda));
95        }
96        
97        Ok(())
98    }
99    
100    /// Log configuration parameters via tracing
101    pub fn print(&self) {
102        tracing::info!("Build Configuration:");
103        tracing::info!("  k = {}", self.k);
104        tracing::info!("  m = {}", self.m);
105        tracing::debug!("  seed = {}", self.seed);
106        if self.num_threads == 0 {
107            tracing::info!("  num_threads = all available cores");
108        } else {
109            tracing::info!("  num_threads = {}", self.num_threads);
110        }
111        tracing::debug!("  ram_limit_gib = {}", self.ram_limit_gib);
112        tracing::debug!("  lambda = {}", self.lambda);
113        tracing::info!("  canonical = {}", self.canonical);
114        tracing::debug!("  weighted = {}", self.weighted);
115        tracing::debug!("  verbose = {}", self.verbose);
116        tracing::debug!("  tmp_dirname = {:?}", self.tmp_dirname);
117    }
118}
119
#[cfg(test)]
mod tests {
    use super::*;

    /// The defaults are k=31, m=19 and must pass validation as-is.
    #[test]
    fn test_default_config() {
        let defaults = BuildConfiguration::default();
        assert_eq!((defaults.k, defaults.m), (31, 19));
        assert!(defaults.validate().is_ok());
    }

    /// `new` stores the requested lengths when they form a valid pair.
    #[test]
    fn test_new_config() {
        let built = BuildConfiguration::new(21, 11).unwrap();
        assert_eq!((built.k, built.m), (21, 11));
    }

    /// An even k-mer length is rejected.
    #[test]
    fn test_validate_even_k() {
        let even_k = BuildConfiguration {
            k: 30,
            ..Default::default()
        };
        assert!(even_k.validate().is_err());
    }

    /// An even minimizer length is accepted as long as it stays below k.
    #[test]
    fn test_validate_even_m() {
        let even_m = BuildConfiguration {
            m: 20,
            ..Default::default()
        };
        assert!(even_m.validate().is_ok());
    }

    /// A minimizer as long as the k-mer is rejected.
    #[test]
    fn test_validate_m_ge_k() {
        let m_equals_k = BuildConfiguration {
            k: 21,
            m: 21,
            ..Default::default()
        };
        assert!(m_equals_k.validate().is_err());
    }

    /// Odd k values on either side of the accepted range are rejected.
    #[test]
    fn test_validate_k_out_of_range() {
        for k in [65, 1] {
            let out_of_range = BuildConfiguration {
                k,
                ..Default::default()
            };
            assert!(out_of_range.validate().is_err());
        }
    }
}