base_d/core/
config.rs

1use serde::Deserialize;
2use std::collections::HashMap;
3
4/// Encoding strategy for converting binary data to text.
5///
6/// Different modes offer different tradeoffs between efficiency, compatibility,
7/// and features.
8#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
9#[serde(rename_all = "snake_case")]
10#[derive(Default)]
11pub enum EncodingMode {
12    /// True radix/base conversion treating data as a large number.
13    /// Works with any dictionary size. Output length varies with input.
14    /// Requires entire input before producing output (not streamable).
15    #[default]
16    #[serde(alias = "base_conversion")]
17    Radix,
18    /// Fixed-size bit chunking per RFC 4648.
19    /// Requires power-of-two dictionary size. Supports padding.
20    Chunked,
21    /// Direct 1:1 byte-to-character mapping using Unicode codepoint ranges.
22    /// Zero encoding overhead. Always 256 characters.
23    ByteRange,
24}
25
/// Configuration for a single dictionary loaded from TOML.
///
/// The alphabet can be described by an explicit `chars` list, by a sequential
/// range (`start` + `length`), or, for ByteRange mode, by `start_codepoint`.
/// Every key except `common` falls back to an empty/`None` value when it is
/// absent from the TOML table; `common` falls back to `true`.
#[derive(Debug, Deserialize, Clone)]
pub struct DictionaryConfig {
    /// The characters comprising the dictionary (explicit list).
    /// When non-empty, `effective_chars` returns this and ignores
    /// `start` + `length`.
    #[serde(default)]
    pub chars: String,
    /// Starting character for range-based dictionary definition.
    /// Use with `length` to define sequential Unicode ranges. Only the first
    /// character of the string is used as the range start.
    #[serde(default)]
    pub start: Option<String>,
    /// Number of characters in range-based dictionary.
    /// Use with `start` to define sequential Unicode ranges.
    #[serde(default)]
    pub length: Option<usize>,
    /// The encoding mode to use (auto-detected by `effective_mode` if not
    /// specified).
    #[serde(default)]
    pub mode: Option<EncodingMode>,
    /// Optional padding character (e.g., "=" for base64).
    #[serde(default)]
    pub padding: Option<String>,
    /// Starting Unicode codepoint for ByteRange mode (256 chars).
    /// Setting this without an explicit `mode` makes `effective_mode` report
    /// ByteRange.
    #[serde(default)]
    pub start_codepoint: Option<u32>,
    /// Whether this dictionary renders consistently across platforms (default: true).
    /// Dictionaries with common=false are excluded from random selection (--dejavu).
    #[serde(default = "default_true")]
    pub common: bool,
}
54
55impl DictionaryConfig {
56    /// Returns the effective character set, generating from range if needed.
57    ///
58    /// Priority:
59    /// 1. If `chars` is non-empty, use it directly
60    /// 2. If `start` + `length` are set, generate sequential range
61    /// 3. Otherwise return empty string (ByteRange mode uses start_codepoint instead)
62    pub fn effective_chars(&self) -> Result<String, String> {
63        // Explicit chars take priority
64        if !self.chars.is_empty() {
65            return Ok(self.chars.clone());
66        }
67
68        // Generate from start + length range
69        if let (Some(start_str), Some(length)) = (&self.start, self.length) {
70            let start_char = start_str
71                .chars()
72                .next()
73                .ok_or("start must contain at least one character")?;
74            let start_codepoint = start_char as u32;
75
76            return Self::generate_range(start_codepoint, length);
77        }
78
79        // No chars defined - might be ByteRange mode
80        Ok(String::new())
81    }
82
83    /// Generate a string of sequential Unicode characters from a range.
84    fn generate_range(start: u32, length: usize) -> Result<String, String> {
85        const MAX_UNICODE: u32 = 0x10FFFF;
86        const SURROGATE_START: u32 = 0xD800;
87        const SURROGATE_END: u32 = 0xDFFF;
88
89        if length == 0 {
90            return Err("length must be greater than 0".to_string());
91        }
92
93        let end = start
94            .checked_add(length as u32 - 1)
95            .ok_or("range exceeds maximum Unicode codepoint")?;
96
97        if end > MAX_UNICODE {
98            return Err(format!(
99                "range end U+{:X} exceeds maximum Unicode codepoint U+{:X}",
100                end, MAX_UNICODE
101            ));
102        }
103
104        // Check for surrogate gap crossing
105        let crosses_surrogates = start <= SURROGATE_END && end >= SURROGATE_START;
106        if crosses_surrogates {
107            return Err(format!(
108                "range U+{:X}..U+{:X} crosses surrogate gap (U+D800..U+DFFF)",
109                start, end
110            ));
111        }
112
113        let mut result = String::with_capacity(length * 4); // UTF-8 worst case
114        for i in 0..length {
115            let codepoint = start + i as u32;
116            match char::from_u32(codepoint) {
117                Some(c) => result.push(c),
118                None => return Err(format!("invalid codepoint U+{:X}", codepoint)),
119            }
120        }
121
122        Ok(result)
123    }
124
125    /// Returns the effective encoding mode, auto-detecting if not explicitly set.
126    ///
127    /// Auto-detection rules:
128    /// - ByteRange: Must be explicitly set (requires start_codepoint)
129    /// - Chunked: If alphabet length is a power of 2
130    /// - Radix: Otherwise (true base conversion)
131    pub fn effective_mode(&self) -> EncodingMode {
132        if let Some(mode) = &self.mode {
133            return mode.clone();
134        }
135
136        // Auto-detect based on alphabet length
137        let len = if self.start_codepoint.is_some() {
138            // ByteRange must be explicit, but if someone sets start_codepoint
139            // without mode, assume they want ByteRange
140            return EncodingMode::ByteRange;
141        } else if let Some(length) = self.length {
142            // Range-based definition
143            length
144        } else {
145            self.chars.chars().count()
146        };
147
148        if len > 0 && len.is_power_of_two() {
149            EncodingMode::Chunked
150        } else {
151            EncodingMode::Radix
152        }
153    }
154}
155
/// Serde default provider that always yields `true`.
///
/// Referenced by `#[serde(default = "default_true")]` on
/// `DictionaryConfig::common`, so a dictionary without a `common` key is
/// treated as common.
fn default_true() -> bool {
    true
}
159
/// Collection of dictionary configurations loaded from TOML files.
///
/// This is the top-level shape of a configuration document: a `dictionaries`
/// table plus optional `compression` and `settings` tables.
#[derive(Debug, Deserialize)]
pub struct DictionaryRegistry {
    /// Map of dictionary names to their configurations.
    /// Unlike the other fields this one carries no `#[serde(default)]`.
    pub dictionaries: HashMap<String, DictionaryConfig>,
    /// Compression algorithm configurations, keyed by name
    /// (empty when the table is absent).
    #[serde(default)]
    pub compression: HashMap<String, CompressionConfig>,
    /// Global settings (`Settings::default()` when the table is absent).
    #[serde(default)]
    pub settings: Settings,
}
172
/// Configuration for a compression algorithm.
///
/// Stored per algorithm name in `DictionaryRegistry::compression`.
#[derive(Debug, Deserialize, Clone)]
pub struct CompressionConfig {
    /// Default compression level.
    /// NOTE(review): the valid range presumably depends on the algorithm;
    /// nothing in this file validates it.
    pub default_level: u32,
}
179
/// xxHash-specific settings.
///
/// Both fields are optional in the configuration; the derived `Default`
/// gives a seed of `0` and no secret file.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct XxHashSettings {
    /// Default seed for xxHash algorithms (`0` when not configured).
    #[serde(default)]
    pub default_seed: u64,
    /// Path to default secret file for XXH3 variants (`None` when not configured).
    #[serde(default)]
    pub default_secret_file: Option<String>,
}
190
/// Global settings for base-d.
///
/// Every field is optional in the configuration and falls back to its
/// `Default` value when absent.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct Settings {
    /// Default dictionary - if not set, requires explicit -e or --dejavu.
    #[serde(default)]
    pub default_dictionary: Option<String>,
    /// xxHash configuration (`XxHashSettings::default()` when absent).
    #[serde(default)]
    pub xxhash: XxHashSettings,
}
201
202impl DictionaryRegistry {
203    /// Parses dictionary configurations from TOML content.
204    pub fn from_toml(content: &str) -> Result<Self, toml::de::Error> {
205        toml::from_str(content)
206    }
207
208    /// Loads the built-in dictionary configurations.
209    ///
210    /// Returns the default dictionaries bundled with the library.
211    pub fn load_default() -> Result<Self, Box<dyn std::error::Error>> {
212        let content = include_str!("../../dictionaries.toml");
213        Ok(Self::from_toml(content)?)
214    }
215
216    /// Loads configuration from a custom file path.
217    pub fn load_from_file(path: &std::path::Path) -> Result<Self, Box<dyn std::error::Error>> {
218        let content = std::fs::read_to_string(path)?;
219        Ok(Self::from_toml(&content)?)
220    }
221
222    /// Loads configuration with user overrides from standard locations.
223    ///
224    /// Searches in priority order:
225    /// 1. Built-in dictionaries (from library)
226    /// 2. `~/.config/base-d/dictionaries.toml` (user overrides)
227    /// 3. `./dictionaries.toml` (project-local overrides)
228    ///
229    /// Later configurations override earlier ones for matching dictionary names.
230    pub fn load_with_overrides() -> Result<Self, Box<dyn std::error::Error>> {
231        let mut config = Self::load_default()?;
232
233        // Try to load user config from ~/.config/base-d/dictionaries.toml
234        if let Some(config_dir) = dirs::config_dir() {
235            let user_config_path = config_dir.join("base-d").join("dictionaries.toml");
236            if user_config_path.exists() {
237                match Self::load_from_file(&user_config_path) {
238                    Ok(user_config) => {
239                        config.merge(user_config);
240                    }
241                    Err(e) => {
242                        eprintln!(
243                            "Warning: Failed to load user config from {:?}: {}",
244                            user_config_path, e
245                        );
246                    }
247                }
248            }
249        }
250
251        // Try to load local config from ./dictionaries.toml
252        let local_config_path = std::path::Path::new("dictionaries.toml");
253        if local_config_path.exists() {
254            match Self::load_from_file(local_config_path) {
255                Ok(local_config) => {
256                    config.merge(local_config);
257                }
258                Err(e) => {
259                    eprintln!(
260                        "Warning: Failed to load local config from {:?}: {}",
261                        local_config_path, e
262                    );
263                }
264            }
265        }
266
267        Ok(config)
268    }
269
270    /// Merges another configuration into this one.
271    ///
272    /// Dictionaries from `other` override dictionaries with the same name in `self`.
273    pub fn merge(&mut self, other: DictionaryRegistry) {
274        for (name, dictionary) in other.dictionaries {
275            self.dictionaries.insert(name, dictionary);
276        }
277    }
278
279    /// Retrieves an dictionary configuration by name.
280    pub fn get_dictionary(&self, name: &str) -> Option<&DictionaryConfig> {
281        self.dictionaries.get(name)
282    }
283}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Dictionary described only by an explicit character list.
    fn explicit(chars: &str) -> DictionaryConfig {
        DictionaryConfig {
            chars: chars.to_string(),
            start: None,
            length: None,
            mode: None,
            padding: None,
            start_codepoint: None,
            common: true,
        }
    }

    /// Dictionary described only by a `start` + `length` range.
    fn ranged(start: &str, length: usize) -> DictionaryConfig {
        DictionaryConfig {
            start: Some(start.to_string()),
            length: Some(length),
            ..explicit("")
        }
    }

    /// Explicit-character dictionary with the mode pinned to Radix.
    fn radix(chars: &str) -> DictionaryConfig {
        DictionaryConfig {
            mode: Some(EncodingMode::Radix),
            ..explicit(chars)
        }
    }

    /// Registry with no dictionaries, no compression entries, default settings.
    fn empty_registry() -> DictionaryRegistry {
        DictionaryRegistry {
            dictionaries: HashMap::new(),
            compression: HashMap::new(),
            settings: Settings::default(),
        }
    }

    #[test]
    fn test_load_default_config() {
        let registry = DictionaryRegistry::load_default().unwrap();
        assert!(registry.dictionaries.contains_key("cards"));
    }

    #[test]
    fn test_cards_dictionary_length() {
        let registry = DictionaryRegistry::load_default().unwrap();
        let cards = registry.get_dictionary("cards").unwrap();
        assert_eq!(cards.chars.chars().count(), 52);
    }

    #[test]
    fn test_base64_chunked_mode() {
        let registry = DictionaryRegistry::load_default().unwrap();
        let base64 = registry.get_dictionary("base64").unwrap();
        assert_eq!(base64.effective_mode(), EncodingMode::Chunked);
        assert_eq!(base64.padding, Some("=".to_string()));
    }

    #[test]
    fn test_base64_radix_mode() {
        let registry = DictionaryRegistry::load_default().unwrap();
        let base64_radix = registry.get_dictionary("base64_radix").unwrap();
        assert_eq!(base64_radix.effective_mode(), EncodingMode::Radix);
    }

    #[test]
    fn test_auto_detection_power_of_two() {
        // 4 = 2^2, so auto-detection picks Chunked.
        assert_eq!(explicit("ABCD").effective_mode(), EncodingMode::Chunked);

        // 3 is not a power of two, so auto-detection picks Radix.
        assert_eq!(explicit("ABC").effective_mode(), EncodingMode::Radix);
    }

    #[test]
    fn test_explicit_mode_override() {
        // "ABCD" alone would auto-detect as Chunked; the explicit mode wins.
        assert_eq!(radix("ABCD").effective_mode(), EncodingMode::Radix);
    }

    #[test]
    fn test_merge_configs() {
        let mut base = empty_registry();
        base.dictionaries.insert("test1".to_string(), radix("ABC"));

        let mut overrides = empty_registry();
        overrides
            .dictionaries
            .insert("test2".to_string(), radix("XYZ"));
        overrides
            .dictionaries
            .insert("test1".to_string(), radix("DEF"));

        base.merge(overrides);

        // "test1" is replaced, "test2" is added.
        assert_eq!(base.dictionaries.len(), 2);
        assert_eq!(base.get_dictionary("test1").unwrap().chars, "DEF");
        assert_eq!(base.get_dictionary("test2").unwrap().chars, "XYZ");
    }

    #[test]
    fn test_load_from_toml_string() {
        let toml_content = r#"
[dictionaries.custom]
chars = "0123456789"
mode = "base_conversion"
"#;
        let registry = DictionaryRegistry::from_toml(toml_content).unwrap();
        assert!(registry.dictionaries.contains_key("custom"));
        assert_eq!(
            registry.get_dictionary("custom").unwrap().chars,
            "0123456789"
        );
    }

    #[test]
    fn test_effective_chars_from_explicit() {
        assert_eq!(explicit("ABCD").effective_chars().unwrap(), "ABCD");
    }

    #[test]
    fn test_effective_chars_from_range() {
        assert_eq!(ranged("A", 4).effective_chars().unwrap(), "ABCD");
    }

    #[test]
    fn test_effective_chars_explicit_takes_priority() {
        // Explicit chars should override start+length.
        let both = DictionaryConfig {
            chars: "XYZ".to_string(),
            ..ranged("A", 4)
        };
        assert_eq!(both.effective_chars().unwrap(), "XYZ");
    }

    #[test]
    fn test_effective_chars_unicode_range() {
        // Range starting from a non-ASCII character: Korean Hangul U+AC00.
        let generated = ranged("가", 4).effective_chars().unwrap();
        assert_eq!(generated.chars().count(), 4);
        assert_eq!(generated, "가각갂갃");
    }

    #[test]
    fn test_effective_chars_surrogate_gap_error() {
        // Starts just before the surrogates and is long enough to run into them.
        assert!(ranged("\u{D700}", 512).effective_chars().is_err());
    }

    #[test]
    fn test_effective_chars_exceeds_unicode_max() {
        // Starts near the end of Unicode and would run past U+10FFFF.
        assert!(ranged("\u{10FFFE}", 10).effective_chars().is_err());
    }

    #[test]
    fn test_effective_mode_with_length_field() {
        // With empty chars, auto-detection uses the length field.
        // 64 = 2^6, so Chunked.
        assert_eq!(ranged("A", 64).effective_mode(), EncodingMode::Chunked);

        // 52 is not a power of two, so Radix.
        assert_eq!(ranged("A", 52).effective_mode(), EncodingMode::Radix);
    }
}