base_d/core/
config.rs

1use serde::Deserialize;
2use std::collections::HashMap;
3
4/// Encoding strategy for converting binary data to text.
5///
6/// Different modes offer different tradeoffs between efficiency, compatibility,
7/// and features.
8#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
9#[serde(rename_all = "snake_case")]
10#[derive(Default)]
11pub enum EncodingMode {
12    /// True radix/base conversion treating data as a large number.
13    /// Works with any dictionary size. Output length varies with input.
14    /// Requires entire input before producing output (not streamable).
15    #[default]
16    #[serde(alias = "base_conversion")]
17    Radix,
18    /// Fixed-size bit chunking per RFC 4648.
19    /// Requires power-of-two dictionary size. Supports padding.
20    Chunked,
21    /// Direct 1:1 byte-to-character mapping using Unicode codepoint ranges.
22    /// Zero encoding overhead. Always 256 characters.
23    ByteRange,
24}
25
26/// Configuration for a single dictionary loaded from TOML.
27#[derive(Debug, Deserialize, Clone)]
28pub struct DictionaryConfig {
29    /// The characters comprising the dictionary (explicit list)
30    #[serde(default)]
31    pub chars: String,
32    /// Starting character for range-based dictionary definition
33    /// Use with `length` to define sequential Unicode ranges
34    #[serde(default)]
35    pub start: Option<String>,
36    /// Number of characters in range-based dictionary
37    /// Use with `start` to define sequential Unicode ranges
38    #[serde(default)]
39    pub length: Option<usize>,
40    /// The encoding mode to use (auto-detected if not specified)
41    #[serde(default)]
42    pub mode: Option<EncodingMode>,
43    /// Optional padding character (e.g., "=" for base64)
44    #[serde(default)]
45    pub padding: Option<String>,
46    /// Starting Unicode codepoint for ByteRange mode (256 chars)
47    #[serde(default)]
48    pub start_codepoint: Option<u32>,
49    /// Whether this dictionary renders consistently across platforms (default: true)
50    /// Dictionaries with common=false are excluded from random selection (--dejavu)
51    #[serde(default = "default_true")]
52    pub common: bool,
53}
54
55impl DictionaryConfig {
56    /// Returns the effective character set, generating from range if needed.
57    ///
58    /// Priority:
59    /// 1. If `chars` is non-empty, use it directly
60    /// 2. If `start` + `length` are set, generate sequential range
61    /// 3. Otherwise return empty string (ByteRange mode uses start_codepoint instead)
62    pub fn effective_chars(&self) -> Result<String, String> {
63        // Explicit chars take priority
64        if !self.chars.is_empty() {
65            return Ok(self.chars.clone());
66        }
67
68        // Generate from start + length range
69        if let (Some(start_str), Some(length)) = (&self.start, self.length) {
70            let start_char = start_str
71                .chars()
72                .next()
73                .ok_or("start must contain at least one character")?;
74            let start_codepoint = start_char as u32;
75
76            return Self::generate_range(start_codepoint, length);
77        }
78
79        // No chars defined - might be ByteRange mode
80        Ok(String::new())
81    }
82
83    /// Generate a string of sequential Unicode characters from a range.
84    fn generate_range(start: u32, length: usize) -> Result<String, String> {
85        const MAX_UNICODE: u32 = 0x10FFFF;
86        const SURROGATE_START: u32 = 0xD800;
87        const SURROGATE_END: u32 = 0xDFFF;
88
89        if length == 0 {
90            return Err("length must be greater than 0".to_string());
91        }
92
93        let end = start
94            .checked_add(length as u32 - 1)
95            .ok_or("range exceeds maximum Unicode codepoint")?;
96
97        if end > MAX_UNICODE {
98            return Err(format!(
99                "range end U+{:X} exceeds maximum Unicode codepoint U+{:X}",
100                end, MAX_UNICODE
101            ));
102        }
103
104        // Check for surrogate gap crossing
105        let crosses_surrogates = start <= SURROGATE_END && end >= SURROGATE_START;
106        if crosses_surrogates {
107            return Err(format!(
108                "range U+{:X}..U+{:X} crosses surrogate gap (U+D800..U+DFFF)",
109                start, end
110            ));
111        }
112
113        let mut result = String::with_capacity(length * 4); // UTF-8 worst case
114        for i in 0..length {
115            let codepoint = start + i as u32;
116            match char::from_u32(codepoint) {
117                Some(c) => result.push(c),
118                None => return Err(format!("invalid codepoint U+{:X}", codepoint)),
119            }
120        }
121
122        Ok(result)
123    }
124
125    /// Returns the effective encoding mode, auto-detecting if not explicitly set.
126    ///
127    /// Auto-detection rules:
128    /// - ByteRange: Must be explicitly set (requires start_codepoint)
129    /// - Chunked: If alphabet length is a power of 2
130    /// - Radix: Otherwise (true base conversion)
131    pub fn effective_mode(&self) -> EncodingMode {
132        if let Some(mode) = &self.mode {
133            return mode.clone();
134        }
135
136        // Auto-detect based on alphabet length
137        let len = if self.start_codepoint.is_some() {
138            // ByteRange must be explicit, but if someone sets start_codepoint
139            // without mode, assume they want ByteRange
140            return EncodingMode::ByteRange;
141        } else if let Some(length) = self.length {
142            // Range-based definition
143            length
144        } else {
145            self.chars.chars().count()
146        };
147
148        if len > 0 && len.is_power_of_two() {
149            EncodingMode::Chunked
150        } else {
151            EncodingMode::Radix
152        }
153    }
154}
155
/// Serde default provider that yields `true`.
///
/// Referenced by `#[serde(default = "default_true")]` so that a missing
/// boolean key deserializes as `true` rather than `false`.
fn default_true() -> bool {
    true
}
159
160/// Collection of dictionary configurations loaded from TOML files.
161#[derive(Debug, Deserialize)]
162pub struct DictionaryRegistry {
163    /// Map of dictionary names to their configurations
164    pub dictionaries: HashMap<String, DictionaryConfig>,
165    /// Compression algorithm configurations
166    #[serde(default)]
167    pub compression: HashMap<String, CompressionConfig>,
168    /// Global settings
169    #[serde(default)]
170    pub settings: Settings,
171}
172
173/// Configuration for a compression algorithm.
174#[derive(Debug, Deserialize, Clone)]
175pub struct CompressionConfig {
176    /// Default compression level
177    pub default_level: u32,
178}
179
180/// xxHash-specific settings.
181#[derive(Debug, Deserialize, Clone, Default)]
182pub struct XxHashSettings {
183    /// Default seed for xxHash algorithms
184    #[serde(default)]
185    pub default_seed: u64,
186    /// Path to default secret file for XXH3 variants
187    #[serde(default)]
188    pub default_secret_file: Option<String>,
189}
190
191/// Global settings for base-d.
192#[derive(Debug, Deserialize, Clone, Default)]
193pub struct Settings {
194    /// Default dictionary - if not set, requires explicit -e or --dejavu
195    #[serde(default)]
196    pub default_dictionary: Option<String>,
197    /// xxHash configuration
198    #[serde(default)]
199    pub xxhash: XxHashSettings,
200}
201
202impl DictionaryRegistry {
203    /// Parses dictionary configurations from TOML content.
204    pub fn from_toml(content: &str) -> Result<Self, toml::de::Error> {
205        toml::from_str(content)
206    }
207
208    /// Loads the built-in dictionary configurations.
209    ///
210    /// Returns the default dictionaries bundled with the library.
211    pub fn load_default() -> Result<Self, Box<dyn std::error::Error>> {
212        let content = include_str!("../../dictionaries.toml");
213        Ok(Self::from_toml(content)?)
214    }
215
216    /// Loads configuration from a custom file path.
217    pub fn load_from_file(path: &std::path::Path) -> Result<Self, Box<dyn std::error::Error>> {
218        let content = std::fs::read_to_string(path)?;
219        Ok(Self::from_toml(&content)?)
220    }
221
222    /// Loads configuration with user overrides from standard locations.
223    ///
224    /// Searches in priority order:
225    /// 1. Built-in dictionaries (from library)
226    /// 2. `~/.config/base-d/dictionaries.toml` (user overrides)
227    /// 3. `./dictionaries.toml` (project-local overrides)
228    ///
229    /// Later configurations override earlier ones for matching dictionary names.
230    pub fn load_with_overrides() -> Result<Self, Box<dyn std::error::Error>> {
231        let mut config = Self::load_default()?;
232
233        // Try to load user config from ~/.config/base-d/dictionaries.toml
234        if let Some(config_dir) = dirs::config_dir() {
235            let user_config_path = config_dir.join("base-d").join("dictionaries.toml");
236            if user_config_path.exists() {
237                match Self::load_from_file(&user_config_path) {
238                    Ok(user_config) => {
239                        config.merge(user_config);
240                    }
241                    Err(e) => {
242                        eprintln!(
243                            "Warning: Failed to load user config from {:?}: {}",
244                            user_config_path, e
245                        );
246                    }
247                }
248            }
249        }
250
251        // Try to load local config from ./dictionaries.toml
252        let local_config_path = std::path::Path::new("dictionaries.toml");
253        if local_config_path.exists() {
254            match Self::load_from_file(local_config_path) {
255                Ok(local_config) => {
256                    config.merge(local_config);
257                }
258                Err(e) => {
259                    eprintln!(
260                        "Warning: Failed to load local config from {:?}: {}",
261                        local_config_path, e
262                    );
263                }
264            }
265        }
266
267        Ok(config)
268    }
269
270    /// Merges another configuration into this one.
271    ///
272    /// Dictionaries from `other` override dictionaries with the same name in `self`.
273    pub fn merge(&mut self, other: DictionaryRegistry) {
274        for (name, dictionary) in other.dictionaries {
275            self.dictionaries.insert(name, dictionary);
276        }
277    }
278
279    /// Retrieves an dictionary configuration by name.
280    pub fn get_dictionary(&self, name: &str) -> Option<&DictionaryConfig> {
281        self.dictionaries.get(name)
282    }
283
284    /// Builds a ready-to-use Dictionary from a named configuration.
285    ///
286    /// This is a convenience method that handles the common pattern of:
287    /// 1. Looking up the dictionary config
288    /// 2. Getting effective chars
289    /// 3. Building the Dictionary with proper mode/padding
290    ///
291    /// # Example
292    /// ```
293    /// # use base_d::DictionaryRegistry;
294    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
295    /// let registry = DictionaryRegistry::load_default()?;
296    /// let dict = registry.dictionary("base64")?;
297    /// let encoded = base_d::encode(b"Hello", &dict);
298    /// # Ok(())
299    /// # }
300    /// ```
301    pub fn dictionary(
302        &self,
303        name: &str,
304    ) -> Result<crate::Dictionary, crate::encoders::algorithms::errors::DictionaryNotFoundError>
305    {
306        let config = self.get_dictionary(name).ok_or_else(|| {
307            crate::encoders::algorithms::errors::DictionaryNotFoundError::new(name)
308        })?;
309
310        self.build_dictionary(config).map_err(|e| {
311            crate::encoders::algorithms::errors::DictionaryNotFoundError::with_cause(name, e)
312        })
313    }
314
315    /// Returns a random dictionary suitable for encoding.
316    ///
317    /// Only selects from dictionaries marked as `common = true` (the default).
318    /// These are dictionaries that render consistently across platforms.
319    ///
320    /// # Example
321    /// ```
322    /// # use base_d::DictionaryRegistry;
323    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
324    /// let registry = DictionaryRegistry::load_default()?;
325    /// let (name, dict) = registry.random()?;
326    /// let encoded = base_d::encode(b"Hello", &dict);
327    /// # Ok(())
328    /// # }
329    /// ```
330    pub fn random(&self) -> Result<(String, crate::Dictionary), Box<dyn std::error::Error>> {
331        use rand::seq::IteratorRandom;
332
333        let common_names: Vec<&String> = self
334            .dictionaries
335            .iter()
336            .filter(|(_, config)| config.common)
337            .map(|(name, _)| name)
338            .collect();
339
340        let name = common_names
341            .into_iter()
342            .choose(&mut rand::rng())
343            .ok_or("No common dictionaries available")?;
344
345        let dict = self.dictionary(name)?;
346        Ok((name.clone(), dict))
347    }
348
349    /// Returns a list of all dictionary names.
350    pub fn names(&self) -> Vec<&str> {
351        self.dictionaries.keys().map(|s| s.as_str()).collect()
352    }
353
354    /// Returns a list of common dictionary names (suitable for random selection).
355    pub fn common_names(&self) -> Vec<&str> {
356        self.dictionaries
357            .iter()
358            .filter(|(_, config)| config.common)
359            .map(|(name, _)| name.as_str())
360            .collect()
361    }
362
363    /// Internal helper to build a Dictionary from a DictionaryConfig.
364    fn build_dictionary(&self, config: &DictionaryConfig) -> Result<crate::Dictionary, String> {
365        use crate::core::config::EncodingMode;
366
367        let mode = config.effective_mode();
368
369        // ByteRange mode uses start_codepoint, not chars
370        if mode == EncodingMode::ByteRange {
371            let start = config
372                .start_codepoint
373                .ok_or("ByteRange mode requires start_codepoint")?;
374            return crate::Dictionary::builder()
375                .mode(mode)
376                .start_codepoint(start)
377                .build();
378        }
379
380        // Get effective chars (handles both explicit and range-based)
381        let chars_str = config.effective_chars()?;
382        let chars: Vec<char> = chars_str.chars().collect();
383
384        // Build with optional padding
385        let mut builder = crate::Dictionary::builder().chars(chars).mode(mode);
386
387        if let Some(pad_str) = &config.padding
388            && let Some(pad_char) = pad_str.chars().next()
389        {
390            builder = builder.padding(pad_char);
391        }
392
393        builder.build()
394    }
395}
396
#[cfg(test)]
mod tests {
    use super::*;

    /// Config with an explicit character list; every other field is unset and
    /// `common` is true.
    fn explicit(chars: &str) -> DictionaryConfig {
        DictionaryConfig {
            chars: chars.to_string(),
            start: None,
            length: None,
            mode: None,
            padding: None,
            start_codepoint: None,
            common: true,
        }
    }

    /// Range-based config (`start` + `length`) with no explicit characters.
    fn ranged(start: &str, length: usize) -> DictionaryConfig {
        DictionaryConfig {
            start: Some(start.to_string()),
            length: Some(length),
            ..explicit("")
        }
    }

    /// Explicit-character config with the mode pinned to Radix.
    fn radix(chars: &str) -> DictionaryConfig {
        DictionaryConfig {
            mode: Some(EncodingMode::Radix),
            ..explicit(chars)
        }
    }

    /// Registry with no dictionaries, no compression entries, default settings.
    fn empty_registry() -> DictionaryRegistry {
        DictionaryRegistry {
            dictionaries: HashMap::new(),
            compression: HashMap::new(),
            settings: Settings::default(),
        }
    }

    #[test]
    fn test_load_default_config() {
        let registry = DictionaryRegistry::load_default().unwrap();
        assert!(registry.dictionaries.contains_key("cards"));
    }

    #[test]
    fn test_cards_dictionary_length() {
        let registry = DictionaryRegistry::load_default().unwrap();
        let cards = registry.get_dictionary("cards").unwrap();
        assert_eq!(cards.chars.chars().count(), 52);
    }

    #[test]
    fn test_base64_chunked_mode() {
        let registry = DictionaryRegistry::load_default().unwrap();
        let base64 = registry.get_dictionary("base64").unwrap();
        assert_eq!(base64.effective_mode(), EncodingMode::Chunked);
        assert_eq!(base64.padding, Some("=".to_string()));
    }

    #[test]
    fn test_base64_radix_mode() {
        let registry = DictionaryRegistry::load_default().unwrap();
        let base64_radix = registry.get_dictionary("base64_radix").unwrap();
        assert_eq!(base64_radix.effective_mode(), EncodingMode::Radix);
    }

    #[test]
    fn test_auto_detection_power_of_two() {
        // 4 = 2^2 → Chunked
        assert_eq!(explicit("ABCD").effective_mode(), EncodingMode::Chunked);

        // 3 is not a power of two → Radix
        assert_eq!(explicit("ABC").effective_mode(), EncodingMode::Radix);
    }

    #[test]
    fn test_explicit_mode_override() {
        // "ABCD" alone would auto-detect as Chunked; the explicit mode wins.
        assert_eq!(radix("ABCD").effective_mode(), EncodingMode::Radix);
    }

    #[test]
    fn test_merge_configs() {
        let mut base = empty_registry();
        base.dictionaries.insert("test1".to_string(), radix("ABC"));

        let mut overrides = empty_registry();
        overrides
            .dictionaries
            .insert("test2".to_string(), radix("XYZ"));
        overrides
            .dictionaries
            .insert("test1".to_string(), radix("DEF"));

        base.merge(overrides);

        assert_eq!(base.dictionaries.len(), 2);
        assert_eq!(base.get_dictionary("test1").unwrap().chars, "DEF");
        assert_eq!(base.get_dictionary("test2").unwrap().chars, "XYZ");
    }

    #[test]
    fn test_load_from_toml_string() {
        let toml_content = r#"
[dictionaries.custom]
chars = "0123456789"
mode = "base_conversion"
"#;
        let registry = DictionaryRegistry::from_toml(toml_content).unwrap();
        assert!(registry.dictionaries.contains_key("custom"));
        assert_eq!(
            registry.get_dictionary("custom").unwrap().chars,
            "0123456789"
        );
    }

    #[test]
    fn test_effective_chars_from_explicit() {
        assert_eq!(explicit("ABCD").effective_chars().unwrap(), "ABCD");
    }

    #[test]
    fn test_effective_chars_from_range() {
        assert_eq!(ranged("A", 4).effective_chars().unwrap(), "ABCD");
    }

    #[test]
    fn test_effective_chars_explicit_takes_priority() {
        // Explicit chars should override start+length
        let config = DictionaryConfig {
            chars: "XYZ".to_string(),
            ..ranged("A", 4)
        };
        assert_eq!(config.effective_chars().unwrap(), "XYZ");
    }

    #[test]
    fn test_effective_chars_unicode_range() {
        // Range starting from a non-ASCII character: Korean Hangul U+AC00
        let generated = ranged("가", 4).effective_chars().unwrap();
        assert_eq!(generated.chars().count(), 4);
        assert_eq!(generated, "가각갂갃");
    }

    #[test]
    fn test_effective_chars_surrogate_gap_error() {
        // Starts just before the surrogates and is long enough to run into them
        let config = ranged("\u{D700}", 512);
        assert!(config.effective_chars().is_err());
    }

    #[test]
    fn test_effective_chars_exceeds_unicode_max() {
        // Starts near the end of Unicode and would run past U+10FFFF
        let config = ranged("\u{10FFFE}", 10);
        assert!(config.effective_chars().is_err());
    }

    #[test]
    fn test_effective_mode_with_length_field() {
        // Auto-detect should use the length field when chars is empty.
        // 64 = 2^6 → Chunked
        assert_eq!(ranged("A", 64).effective_mode(), EncodingMode::Chunked);

        // 52 is not a power of two → Radix
        assert_eq!(ranged("A", 52).effective_mode(), EncodingMode::Radix);
    }
}