// base-d 3.0.34 - Docs.rs

use crate::core::config::EncodingMode;
#[cfg(feature = "simd")]
use crate::simd::variants::DictionaryMetadata;
use std::collections::HashMap;

/// Upper bound (exclusive) on the codepoints served by the fast decode lookup table.
///
/// A dictionary only gets a lookup table when every one of its characters has a
/// codepoint below this value, and decoding only consults the table for
/// characters whose codepoint is below it.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;

/// Reports whether a ByteRange dictionary starting at `start` can map every
/// byte value (0-255) to a codepoint that is safe to emit as text.
///
/// Byte `b` is mapped to codepoint `start + b`, so the mapped window is
/// `start..=start + 255`. The window is rejected when it touches any of:
/// - U+0000 (NUL) -- causes CString/git failures
/// - U+0001..=U+001F (C0 controls) -- non-printable, break terminals/parsers
/// - U+007F (DEL) -- non-printable control character
/// - U+0080..=U+009F (C1 controls) -- non-printable, break terminals/parsers
/// - U+D800..=U+DFFF (surrogates) -- invalid in UTF-8, `char::from_u32` returns None
/// - anything past U+10FFFF -- outside Unicode
///
/// Equivalently, `start` is safe when all of the following hold:
/// - `start >= 0x00A0`
/// - `start + 255 < 0xD800` or `start > 0xDFFF`
/// - `start + 255 <= 0x10FFFF`
pub fn is_safe_byte_range(start: u32) -> bool {
    // First codepoint above NUL, the C0 controls, DEL and the C1 controls.
    const FIRST_SAFE: u32 = 0x00A0;
    const SURROGATE_FIRST: u32 = 0xD800;
    const SURROGATE_LAST: u32 = 0xDFFF;
    const UNICODE_MAX: u32 = 0x10FFFF;

    // `checked_add` yields `None` when the codepoint for byte 255 would not
    // even fit in a `u32`; such a window is trivially outside Unicode.
    match start.checked_add(255) {
        Some(end) if start >= FIRST_SAFE && end <= UNICODE_MAX => {
            // The window is disjoint from the surrogate block exactly when it
            // ends before the block begins or starts after the block ends.
            end < SURROGATE_FIRST || start > SURROGATE_LAST
        }
        _ => false,
    }
}

/// Represents an encoding dictionary with its characters and configuration.
///
/// A dictionary defines the character set and encoding mode used for converting
/// binary data to text. Supports three modes: mathematical base conversion,
/// chunked (RFC 4648), and byte-range mapping.
#[derive(Debug, Clone)]
pub struct Dictionary {
    // Dictionary characters; a character's position is the digit it encodes.
    // Left empty for ByteRange dictionaries, which compute characters instead.
    chars: Vec<char>,
    // Reverse mapping from character to its digit, used when decoding.
    char_to_index: HashMap<char, usize>,
    // Fast lookup table for ASCII/extended ASCII characters, indexed by
    // codepoint. Only present when every dictionary character is below 256.
    lookup_table: Option<Box<[Option<usize>; 256]>>,
    // How digits are produced from the input bytes.
    mode: EncodingMode,
    // Optional padding character; never one of the dictionary characters.
    padding: Option<char>,
    // Codepoint that byte 0 maps to; `Some` only for ByteRange dictionaries.
    start_codepoint: Option<u32>,
}

impl Dictionary {
    /// Creates a new DictionaryBuilder for constructing a Dictionary.
    ///
    /// # Example
    ///
    /// ```
    /// use base_d::{Dictionary, EncodingMode};
    /// let dict = Dictionary::builder()
    ///     .chars_from_str("0123456789ABCDEF")
    ///     .mode(EncodingMode::Radix)
    ///     .build()
    ///     .unwrap();
    /// ```
    pub fn builder() -> DictionaryBuilder {
        DictionaryBuilder::new()
    }

    /// Creates a new dictionary with default settings (Radix mode, no padding).
    ///
    /// # Arguments
    ///
    /// * `chars` - Vector of characters to use in the dictionary
    ///
    /// # Errors
    ///
    /// Returns an error if the dictionary is empty, contains duplicate
    /// characters, or contains control/whitespace characters other than space.
    ///
    /// # Deprecated
    ///
    /// Use `Dictionary::builder()` instead for more flexible configuration.
    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
    #[allow(deprecated)]
    pub fn new(chars: Vec<char>) -> Result<Self, String> {
        Self::new_with_mode(chars, EncodingMode::Radix, None)
    }

    /// Creates a new dictionary with specified encoding mode and optional padding.
    ///
    /// # Arguments
    ///
    /// * `chars` - Vector of characters to use in the dictionary
    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
    /// * `padding` - Optional padding character (typically '=' for RFC modes)
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The dictionary is empty or contains duplicates
    /// - Chunked mode is used with a non-power-of-two dictionary size
    /// - `mode` is ByteRange (this constructor cannot supply a start codepoint)
    ///
    /// # Deprecated
    ///
    /// Use `Dictionary::builder()` instead for more flexible configuration.
    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
    #[allow(deprecated)]
    pub fn new_with_mode(
        chars: Vec<char>,
        mode: EncodingMode,
        padding: Option<char>,
    ) -> Result<Self, String> {
        Self::new_with_mode_and_range(chars, mode, padding, None)
    }

    /// Creates a new dictionary with full configuration including byte-range support.
    ///
    /// # Arguments
    ///
    /// * `chars` - Vector of characters (ignored in ByteRange mode)
    /// * `mode` - Encoding mode
    /// * `padding` - Optional padding character
    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - ByteRange mode is requested without a `start_codepoint`, or with one
    ///   that `is_safe_byte_range` rejects
    /// - The dictionary is empty
    /// - Chunked mode is used with a size other than 2, 4, 8, 16, 32, 64, 128 or 256
    /// - A character is duplicated, is a control character, or is whitespace
    ///   other than space
    /// - The padding character is a dictionary character or a control character
    ///   other than tab, line feed or carriage return
    ///
    /// # Deprecated
    ///
    /// Use `Dictionary::builder()` instead for more flexible configuration.
    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
    pub fn new_with_mode_and_range(
        chars: Vec<char>,
        mode: EncodingMode,
        padding: Option<char>,
        start_codepoint: Option<u32>,
    ) -> Result<Self, String> {
        // ByteRange mode doesn't need chars, it only validates start_codepoint.
        if mode == EncodingMode::ByteRange {
            let start = start_codepoint
                .ok_or_else(|| "ByteRange mode requires start_codepoint".to_string())?;

            // Reject windows that cannot represent all 256 bytes or that map a
            // byte onto a control character or surrogate. Such output breaks
            // git, terminals and other text-processing tools.
            if !is_safe_byte_range(start) {
                return Err(format!(
                    "Unsafe ByteRange start_codepoint U+{:04X}: mapped range U+{:04X}..U+{:04X} \
                     overlaps with dangerous codepoints (NUL U+0000, C1 controls U+0080-U+009F, \
                     or surrogates U+D800-U+DFFF)",
                    start,
                    start,
                    // `start` may be rejected precisely because `start + 255`
                    // overflows `u32`; saturate so that reporting the error
                    // never panics (debug) or wraps (release).
                    start.saturating_add(255)
                ));
            }

            return Ok(Dictionary {
                chars: Vec::new(),
                char_to_index: HashMap::new(),
                lookup_table: None,
                mode,
                padding,
                start_codepoint: Some(start),
            });
        }

        if chars.is_empty() {
            return Err("Dictionary cannot be empty".to_string());
        }

        // Validate dictionary size for chunked mode
        if mode == EncodingMode::Chunked {
            let base = chars.len();
            if !base.is_power_of_two() {
                return Err(format!(
                    "Chunked mode requires power-of-two dictionary size, got {}",
                    base
                ));
            }
            // `base` is a power of two here, so a range check is enough to
            // restrict it to 2, 4, ..., 256 (this rejects 1 and anything >= 512).
            if !(2..=256).contains(&base) {
                return Err(format!(
                    "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
                    base
                ));
            }
        }

        // Validate character properties while building the reverse index.
        let mut char_to_index = HashMap::with_capacity(chars.len());
        for (i, &c) in chars.iter().enumerate() {
            // Check for duplicate characters
            if char_to_index.insert(c, i).is_some() {
                return Err(format!(
                    "Duplicate character in dictionary: '{}' (U+{:04X})",
                    c, c as u32
                ));
            }

            // Tab, line feed and carriage return are skipped here so that they
            // are reported by the whitespace check below instead.
            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
                return Err(format!(
                    "Control character not allowed in dictionary: U+{:04X}",
                    c as u32
                ));
            }

            // Check for whitespace (allow space for RFC-compliant encodings like Base45)
            if c.is_whitespace() && c != ' ' {
                return Err(format!(
                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
                    c, c as u32
                ));
            }
        }

        // Validate padding character if present
        if let Some(pad) = padding {
            if char_to_index.contains_key(&pad) {
                return Err(format!(
                    "Padding character '{}' conflicts with dictionary characters",
                    pad
                ));
            }
            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
                return Err(format!(
                    "Control character not allowed as padding: U+{:04X}",
                    pad as u32
                ));
            }
        }

        // Build the codepoint-indexed table only when every character fits in
        // it; otherwise decoding relies on the HashMap alone.
        let lookup_table = if chars
            .iter()
            .all(|&c| (c as usize) < MAX_LOOKUP_TABLE_SIZE)
        {
            let mut table = Box::new([None; 256]);
            for (i, &c) in chars.iter().enumerate() {
                table[c as usize] = Some(i);
            }
            Some(table)
        } else {
            None
        };

        Ok(Dictionary {
            chars,
            char_to_index,
            lookup_table,
            mode,
            padding,
            start_codepoint: None,
        })
    }

    /// Creates a dictionary from a string of characters.
    ///
    /// # Arguments
    ///
    /// * `s` - String containing the dictionary characters
    ///
    /// # Errors
    ///
    /// Returns an error under the same conditions as [`Dictionary::new`].
    ///
    /// # Deprecated
    ///
    /// Use `Dictionary::builder().chars_from_str(s).build()` instead.
    #[deprecated(
        since = "0.1.0",
        note = "Use Dictionary::builder().chars_from_str(s).build() instead"
    )]
    #[allow(deprecated, clippy::should_implement_trait)]
    pub fn from_str(s: &str) -> Result<Self, String> {
        Self::new(s.chars().collect())
    }

    /// Returns the base (radix) of the dictionary.
    ///
    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
    pub fn base(&self) -> usize {
        match self.mode {
            EncodingMode::ByteRange => 256,
            _ => self.chars.len(),
        }
    }

    /// Returns the encoding mode of this dictionary.
    pub fn mode(&self) -> &EncodingMode {
        &self.mode
    }

    /// Returns the padding character, if any.
    pub fn padding(&self) -> Option<char> {
        self.padding
    }

    /// Returns the starting Unicode codepoint for ByteRange mode.
    ///
    /// `None` for dictionaries that were built from an explicit character list.
    pub fn start_codepoint(&self) -> Option<u32> {
        self.start_codepoint
    }

    /// Encodes a digit (0 to base-1) as a character.
    ///
    /// Returns `None` if the digit is out of range.
    pub fn encode_digit(&self, digit: usize) -> Option<char> {
        match self.mode {
            EncodingMode::ByteRange => {
                let start = self.start_codepoint?;
                if digit >= 256 {
                    return None;
                }
                // `digit < 256`, so the cast is lossless; the checked add keeps
                // this total even for a start codepoint near `u32::MAX`.
                start.checked_add(digit as u32).and_then(char::from_u32)
            }
            _ => self.chars.get(digit).copied(),
        }
    }

    /// Decodes a character back to its digit value.
    ///
    /// Returns `None` if the character is not in the dictionary.
    pub fn decode_char(&self, c: char) -> Option<usize> {
        match self.mode {
            EncodingMode::ByteRange => {
                let start = self.start_codepoint?;
                // `checked_sub` is `None` for characters below the window, and
                // the filter drops characters above it.
                (c as u32)
                    .checked_sub(start)
                    .filter(|&offset| offset < 256)
                    .map(|offset| offset as usize)
            }
            _ => {
                // Use fast lookup table for ASCII/extended ASCII characters
                if let Some(table) = &self.lookup_table {
                    let code = c as usize;
                    if code < MAX_LOOKUP_TABLE_SIZE {
                        return table[code];
                    }
                }
                // Fall back to HashMap for everything else
                self.char_to_index.get(&c).copied()
            }
        }
    }

    /// Returns SIMD metadata for this dictionary.
    ///
    /// This provides information about whether SIMD acceleration is available
    /// for this dictionary and which implementation to use.
    #[cfg(feature = "simd")]
    pub fn simd_metadata(&self) -> DictionaryMetadata {
        DictionaryMetadata::from_dictionary(self)
    }

    /// Returns whether SIMD acceleration is available for this dictionary.
    ///
    /// This is a convenience method that checks if SIMD can be used with
    /// the current CPU features and dictionary configuration.
    #[cfg(feature = "simd")]
    pub fn simd_available(&self) -> bool {
        self.simd_metadata().simd_available()
    }

    /// Returns whether SIMD acceleration is available for this dictionary.
    ///
    /// When the `simd` feature is disabled, this always returns `false`.
    #[cfg(not(feature = "simd"))]
    pub fn simd_available(&self) -> bool {
        false
    }
}

/// Builder for constructing a Dictionary with flexible configuration.
///
/// Every setting is optional while building; `build` substitutes Radix mode
/// and an empty character list for the ones left unset and then runs the
/// dictionary validation.
///
/// # Example
///
/// ```
/// use base_d::{Dictionary, EncodingMode};
/// let dict = Dictionary::builder()
///     .chars_from_str("0123456789ABCDEF")
///     .mode(EncodingMode::Radix)
///     .build()
///     .unwrap();
/// ```
#[derive(Debug, Default)]
pub struct DictionaryBuilder {
    // Dictionary characters; `build` uses an empty vector when unset.
    chars: Option<Vec<char>>,
    // Encoding mode; `build` uses `EncodingMode::Radix` when unset.
    mode: Option<EncodingMode>,
    // Optional padding character, passed through unchanged.
    padding: Option<char>,
    // Starting codepoint for ByteRange mode, passed through unchanged.
    start_codepoint: Option<u32>,
}

impl DictionaryBuilder {
    /// Returns a builder on which nothing has been configured yet.
    pub fn new() -> Self {
        Self::default()
    }

    /// Uses the given vector as the dictionary characters.
    ///
    /// Replaces any characters configured earlier.
    ///
    /// # Arguments
    ///
    /// * `chars` - Characters of the dictionary, in digit order
    pub fn chars(self, chars: Vec<char>) -> Self {
        Self {
            chars: Some(chars),
            ..self
        }
    }

    /// Uses the characters of `s`, in order, as the dictionary characters.
    ///
    /// Replaces any characters configured earlier.
    ///
    /// # Arguments
    ///
    /// * `s` - String whose characters make up the dictionary
    pub fn chars_from_str(self, s: &str) -> Self {
        self.chars(s.chars().collect())
    }

    /// Selects the encoding mode.
    ///
    /// # Arguments
    ///
    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
    pub fn mode(self, mode: EncodingMode) -> Self {
        Self {
            mode: Some(mode),
            ..self
        }
    }

    /// Selects the padding character.
    ///
    /// # Arguments
    ///
    /// * `padding` - Padding character (typically '=' for RFC modes)
    pub fn padding(self, padding: char) -> Self {
        Self {
            padding: Some(padding),
            ..self
        }
    }

    /// Selects the first Unicode codepoint of a ByteRange dictionary.
    ///
    /// # Arguments
    ///
    /// * `start_codepoint` - Codepoint that byte 0 is mapped to
    pub fn start_codepoint(self, start_codepoint: u32) -> Self {
        Self {
            start_codepoint: Some(start_codepoint),
            ..self
        }
    }

    /// Validates the configuration and produces the Dictionary.
    ///
    /// An unset mode defaults to Radix and unset characters default to an
    /// empty list.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The configuration is invalid for the specified mode
    /// - Required fields are missing
    /// - Validation fails (duplicates, invalid characters, etc.)
    #[allow(deprecated)]
    pub fn build(self) -> Result<Dictionary, String> {
        let Self {
            chars,
            mode,
            padding,
            start_codepoint,
        } = self;

        Dictionary::new_with_mode_and_range(
            chars.unwrap_or_default(),
            mode.unwrap_or(EncodingMode::Radix),
            padding,
            start_codepoint,
        )
    }
}

/// Unit tests for dictionary validation, the builder, digit encoding/decoding,
/// and the `is_safe_byte_range` boundaries.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let chars = vec![];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c']; // 3 is not power of 2
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Chunked)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Test all valid chunked sizes, including the two largest ones
        for &size in &[2, 4, 8, 16, 32, 64, 128, 256] {
            let chars: Vec<char> = (0..size)
                .map(|i| {
                    // Blocks of 26 letters spaced 100 codepoints apart: distinct,
                    // and never a control or whitespace character for i < 256
                    char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap()
                })
                .collect();
            let result = Dictionary::builder()
                .chars(chars)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_single_character() {
        // 1 is a power of two but not a usable chunked dictionary size
        let result = Dictionary::builder()
            .chars(vec!['a'])
            .mode(EncodingMode::Chunked)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("dictionary size of 2, 4, 8"));
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c']; // null character
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        // Tab should be rejected (only space is allowed for RFC encodings like Base45)
        let chars = vec!['a', 'b', '\t', 'c'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));

        // But space should be allowed (for Base45 RFC 9285 compliance)
        let chars_with_space = vec!['a', 'b', ' ', 'c'];
        let result_space = Dictionary::builder().chars(chars_with_space).build();
        assert!(result_space.is_ok());
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Radix)
            .padding('b')
            .build();
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // Test with a start codepoint so high that start + 255 exceeds max valid Unicode (0x10FFFF)
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80) // 0x10FF80 + 255 = 0x11007F, exceeds 0x10FFFF
            .build();
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_overlapping_surrogates() {
        // 0xD7FF + 255 reaches into the surrogate block
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0xD7FF)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Unsafe ByteRange"));
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300) // Valid start in emoji range
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // Test that error messages include useful information
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::builder().chars(chars).build().unwrap_err();
        assert!(err.contains("'a'") || err.contains("U+"));
    }

    // --- encode_digit / decode_char tests ---

    #[test]
    fn test_encode_decode_round_trip_ascii() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        for digit in 0..dict.base() {
            let c = dict.encode_digit(digit).unwrap();
            assert_eq!(dict.decode_char(c), Some(digit));
        }
        assert_eq!(dict.encode_digit(16), None);
        assert_eq!(dict.decode_char('g'), None);
        assert_eq!(dict.decode_char('\u{1F600}'), None);
    }

    #[test]
    fn test_encode_decode_round_trip_non_ascii() {
        // Characters above U+00FF force the HashMap decoding path
        let dict = Dictionary::builder()
            .chars_from_str("\u{03B1}\u{03B2}\u{03B3}\u{03B4}")
            .build()
            .unwrap();

        assert_eq!(dict.encode_digit(2), Some('\u{03B3}'));
        assert_eq!(dict.decode_char('\u{03B3}'), Some(2));
        assert_eq!(dict.decode_char('a'), None);
    }

    #[test]
    fn test_encode_decode_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.encode_digit(0), char::from_u32(0x1F300));
        assert_eq!(dict.encode_digit(255), char::from_u32(0x1F3FF));
        assert_eq!(dict.encode_digit(256), None);
        assert_eq!(
            dict.decode_char(char::from_u32(0x1F3FF).unwrap()),
            Some(255)
        );
        assert_eq!(dict.decode_char(char::from_u32(0x1F400).unwrap()), None);
        assert_eq!(dict.decode_char('a'), None);
    }

    // DictionaryBuilder tests
    #[test]
    fn test_builder_basic() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(dict.base(), 4);
        assert_eq!(dict.mode(), &EncodingMode::Radix);
        assert_eq!(dict.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dict = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::ByteRange);
        assert_eq!(dict.start_codepoint(), Some(0x1F300));
        assert_eq!(dict.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let result = Dictionary::builder().chars(vec!['a', 'b', 'a']).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c']) // 3 is not power of 2
            .mode(EncodingMode::Chunked)
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dict = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.base(), 2);
        assert_eq!(dict.mode(), &EncodingMode::Chunked);
        assert_eq!(dict.padding(), Some('='));
    }

    // --- is_safe_byte_range boundary tests ---

    #[test]
    fn test_is_safe_byte_range_nul() {
        // start=0 maps byte 0 to U+0000 (NUL) -- unsafe
        assert!(!is_safe_byte_range(0));
    }

    #[test]
    fn test_is_safe_byte_range_end_of_c1() {
        // start=0x009F: end = 0x009F+255 = 0x019E, but start itself is in C1 range -- unsafe
        assert!(!is_safe_byte_range(0x009F));
    }

    #[test]
    fn test_is_safe_byte_range_first_safe() {
        // start=0x00A0: end = 0x00A0+255 = 0x019F, all valid printable codepoints -- safe
        assert!(is_safe_byte_range(0x00A0));
    }

    #[test]
    fn test_is_safe_byte_range_just_below_surrogates() {
        // start=0xD700: end = 0xD700+255 = 0xD7FF, just below surrogate start -- safe
        assert!(is_safe_byte_range(0xD700));
    }

    #[test]
    fn test_is_safe_byte_range_overlaps_surrogate_start() {
        // start=0xD701: end = 0xD701+255 = 0xD800, overlaps surrogate start -- unsafe
        assert!(!is_safe_byte_range(0xD701));
    }

    #[test]
    fn test_is_safe_byte_range_above_surrogates() {
        // start=0xE000: above surrogate range, all valid -- safe
        assert!(is_safe_byte_range(0xE000));
    }

    #[test]
    fn test_is_safe_byte_range_at_unicode_max() {
        // start=0x10FF00: end = 0x10FF00+255 = 0x10FFFF, exactly at Unicode max -- safe
        assert!(is_safe_byte_range(0x10FF00));
    }

    #[test]
    fn test_is_safe_byte_range_exceeds_unicode_max() {
        // start=0x10FF01: end = 0x10FF01+255 = 0x110000, exceeds Unicode max -- unsafe
        assert!(!is_safe_byte_range(0x10FF01));
    }

    #[test]
    fn test_is_safe_byte_range_u32_overflow() {
        // start + 255 does not fit in u32 at all -- unsafe, and must not panic
        assert!(!is_safe_byte_range(u32::MAX));
    }
}