use crate::core::config::EncodingMode;
#[cfg(feature = "simd")]
use crate::simd::variants::DictionaryMetadata;
use std::collections::HashMap;
/// Number of slots in the direct-indexed decode table. A dictionary only gets such a
/// table when every one of its characters has a codepoint below this value.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
/// Reports whether the 256-codepoint window that begins at `start` may be used
/// for byte-range encoding.
///
/// The window covers `start..=start + 255` and is rejected when any of these hold:
/// - `start` lies below U+00A0,
/// - `start + 255` does not fit in a `u32` or lies above U+10FFFF,
/// - the window intersects the surrogate block U+D800..=U+DFFF.
pub fn is_safe_byte_range(start: u32) -> bool {
    const FIRST_ALLOWED: u32 = 0x00A0;
    const UNICODE_MAX: u32 = 0x10FFFF;
    const SURROGATE_FIRST: u32 = 0xD800;
    const SURROGATE_LAST: u32 = 0xDFFF;

    // A window whose last codepoint cannot even be represented is never usable.
    let Some(last) = start.checked_add(255) else {
        return false;
    };

    let touches_surrogates = start <= SURROGATE_LAST && last >= SURROGATE_FIRST;
    start >= FIRST_ALLOWED && last <= UNICODE_MAX && !touches_surrogates
}
/// A character dictionary (alphabet) that maps digit values to characters and back.
///
/// In `EncodingMode::ByteRange` the mapping is arithmetic: digit `d` corresponds to the
/// codepoint `start_codepoint + d`, and `chars` is left empty. In every other mode the
/// mapping is positional: digit `d` corresponds to `chars[d]`.
#[derive(Debug, Clone)]
pub struct Dictionary {
    /// Characters in digit order; empty for byte-range dictionaries.
    chars: Vec<char>,
    /// Reverse mapping from a character to its position in `chars`.
    char_to_index: HashMap<char, usize>,
    /// Direct-indexed reverse mapping, present only when every character's codepoint is
    /// below `MAX_LOOKUP_TABLE_SIZE`.
    lookup_table: Option<Box<[Option<usize>; 256]>>,
    /// Encoding mode this dictionary was built for.
    mode: EncodingMode,
    /// Optional padding character.
    padding: Option<char>,
    /// First codepoint of the 256-codepoint window; `Some` only for byte-range dictionaries.
    start_codepoint: Option<u32>,
}

impl Dictionary {
    /// Returns a fresh [`DictionaryBuilder`].
    pub fn builder() -> DictionaryBuilder {
        DictionaryBuilder::new()
    }

    /// Builds a `EncodingMode::Radix` dictionary without padding from `chars`.
    ///
    /// # Errors
    ///
    /// Same as [`Dictionary::new_with_mode_and_range`].
    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
    #[allow(deprecated)]
    pub fn new(chars: Vec<char>) -> Result<Self, String> {
        Self::new_with_mode(chars, EncodingMode::Radix, None)
    }

    /// Builds a dictionary from `chars` with the given `mode` and optional `padding`,
    /// without a start codepoint.
    ///
    /// # Errors
    ///
    /// Same as [`Dictionary::new_with_mode_and_range`]; in particular
    /// `EncodingMode::ByteRange` always fails here because no start codepoint is passed.
    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
    #[allow(deprecated)]
    pub fn new_with_mode(
        chars: Vec<char>,
        mode: EncodingMode,
        padding: Option<char>,
    ) -> Result<Self, String> {
        Self::new_with_mode_and_range(chars, mode, padding, None)
    }

    /// Validates the inputs and builds a dictionary.
    ///
    /// For `EncodingMode::ByteRange` only `start_codepoint` is validated; `chars` is
    /// ignored and `padding` is stored as given. For every other mode `start_codepoint`
    /// is ignored and `chars` / `padding` are validated.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message when:
    /// - the mode is `ByteRange` and `start_codepoint` is missing or rejected by
    ///   [`is_safe_byte_range`];
    /// - `chars` is empty;
    /// - the mode is `Chunked` and `chars.len()` is not one of 2, 4, 8, ..., 256;
    /// - `chars` contains a duplicate, a control character, or whitespace other than `' '`;
    /// - `padding` is also a dictionary character, or is a control character other than
    ///   `'\t'`, `'\n'`, `'\r'`.
    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
    pub fn new_with_mode_and_range(
        chars: Vec<char>,
        mode: EncodingMode,
        padding: Option<char>,
        start_codepoint: Option<u32>,
    ) -> Result<Self, String> {
        if mode == EncodingMode::ByteRange {
            let Some(start) = start_codepoint else {
                return Err("ByteRange mode requires start_codepoint".to_string());
            };
            if !is_safe_byte_range(start) {
                // Widen before adding: values near `u32::MAX` are rejected precisely
                // because `start + 255` does not fit in a `u32`, so computing the end of
                // the range in `u32` here would overflow while building the message.
                let end = u64::from(start) + 255;
                return Err(format!(
                    "Unsafe ByteRange start_codepoint U+{:04X}: mapped range U+{:04X}..U+{:04X} \
                     overlaps with dangerous codepoints (NUL U+0000, C1 controls U+0080-U+009F, \
                     or surrogates U+D800-U+DFFF)",
                    start, start, end
                ));
            }
            return Ok(Dictionary {
                chars: Vec::new(),
                char_to_index: HashMap::new(),
                lookup_table: None,
                mode,
                padding,
                start_codepoint: Some(start),
            });
        }

        if chars.is_empty() {
            return Err("Dictionary cannot be empty".to_string());
        }
        if mode == EncodingMode::Chunked {
            Self::check_chunked_size(chars.len())?;
        }

        // Per character the checks run in a fixed order (duplicate, control, whitespace)
        // so that the first reported problem stays stable.
        let mut char_to_index = HashMap::with_capacity(chars.len());
        for (i, &c) in chars.iter().enumerate() {
            if char_to_index.insert(c, i).is_some() {
                return Err(format!(
                    "Duplicate character in dictionary: '{}' (U+{:04X})",
                    c, c as u32
                ));
            }
            if c.is_control() && !matches!(c, '\t' | '\n' | '\r') {
                return Err(format!(
                    "Control character not allowed in dictionary: U+{:04X}",
                    c as u32
                ));
            }
            // Tab, newline and carriage return pass the control check above but are
            // still rejected here as whitespace; only a plain space is allowed.
            if c.is_whitespace() && c != ' ' {
                return Err(format!(
                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
                    c, c as u32
                ));
            }
        }

        if let Some(pad) = padding {
            if char_to_index.contains_key(&pad) {
                return Err(format!(
                    "Padding character '{}' conflicts with dictionary characters",
                    pad
                ));
            }
            if pad.is_control() && !matches!(pad, '\t' | '\n' | '\r') {
                return Err(format!(
                    "Control character not allowed as padding: U+{:04X}",
                    pad as u32
                ));
            }
        }

        let lookup_table = Self::build_lookup_table(&chars);
        Ok(Dictionary {
            chars,
            char_to_index,
            lookup_table,
            mode,
            padding,
            start_codepoint: None,
        })
    }

    /// Checks that `base` is a dictionary size accepted by chunked mode.
    fn check_chunked_size(base: usize) -> Result<(), String> {
        if !base.is_power_of_two() {
            return Err(format!(
                "Chunked mode requires power-of-two dictionary size, got {}",
                base
            ));
        }
        // Powers of two outside 2..=256 (that is, 1 and anything above 256) end up here.
        if !matches!(base, 2 | 4 | 8 | 16 | 32 | 64 | 128 | 256) {
            return Err(format!(
                "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
                base
            ));
        }
        Ok(())
    }

    /// Builds the direct-indexed decode table, or returns `None` when any character's
    /// codepoint is `MAX_LOOKUP_TABLE_SIZE` or above.
    fn build_lookup_table(chars: &[char]) -> Option<Box<[Option<usize>; 256]>> {
        let all_fit = chars
            .iter()
            .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32);
        if !all_fit {
            return None;
        }
        let mut table: Box<[Option<usize>; 256]> = Box::new([None; 256]);
        for (i, &c) in chars.iter().enumerate() {
            table[c as usize] = Some(i);
        }
        Some(table)
    }

    /// Builds a `EncodingMode::Radix` dictionary without padding from the characters of `s`.
    ///
    /// # Errors
    ///
    /// Same as [`Dictionary::new`].
    #[deprecated(
        since = "0.1.0",
        note = "Use Dictionary::builder().chars_from_str(s).build() instead"
    )]
    #[allow(deprecated, clippy::should_implement_trait)]
    pub fn from_str(s: &str) -> Result<Self, String> {
        Self::new(s.chars().collect())
    }

    /// Number of distinct digit values: 256 in byte-range mode, otherwise the number of
    /// dictionary characters.
    pub fn base(&self) -> usize {
        match self.mode {
            EncodingMode::ByteRange => 256,
            _ => self.chars.len(),
        }
    }

    /// Encoding mode of this dictionary.
    pub fn mode(&self) -> &EncodingMode {
        &self.mode
    }

    /// Padding character, if one was configured.
    pub fn padding(&self) -> Option<char> {
        self.padding
    }

    /// First codepoint of the byte-range window; `None` outside byte-range mode.
    pub fn start_codepoint(&self) -> Option<u32> {
        self.start_codepoint
    }

    /// Maps a digit value to its character.
    ///
    /// Returns `None` when `digit` is outside the dictionary (256 or more in byte-range
    /// mode, `chars.len()` or more otherwise) or when the resulting codepoint is not a
    /// valid `char`.
    pub fn encode_digit(&self, digit: usize) -> Option<char> {
        match self.mode {
            EncodingMode::ByteRange => {
                let start = self.start_codepoint?;
                if digit >= 256 {
                    return None;
                }
                // `digit` is below 256 here, so the cast cannot truncate.
                start.checked_add(digit as u32).and_then(char::from_u32)
            }
            _ => self.chars.get(digit).copied(),
        }
    }

    /// Maps a character back to its digit value, or `None` when `c` is not part of the
    /// dictionary.
    pub fn decode_char(&self, c: char) -> Option<usize> {
        match self.mode {
            EncodingMode::ByteRange => {
                let start = self.start_codepoint?;
                // `checked_sub` yields `None` for codepoints below `start`; the filter
                // drops those past the end of the 256-codepoint window.
                (c as u32)
                    .checked_sub(start)
                    .filter(|&offset| offset < 256)
                    .map(|offset| offset as usize)
            }
            _ => {
                let code = c as u32;
                if code < MAX_LOOKUP_TABLE_SIZE as u32 {
                    if let Some(table) = &self.lookup_table {
                        return table[code as usize];
                    }
                }
                self.char_to_index.get(&c).copied()
            }
        }
    }

    /// Returns the metadata produced by `DictionaryMetadata::from_dictionary` for this
    /// dictionary.
    #[cfg(feature = "simd")]
    pub fn simd_metadata(&self) -> DictionaryMetadata {
        DictionaryMetadata::from_dictionary(self)
    }

    /// Whether the SIMD metadata reports SIMD as available for this dictionary.
    #[cfg(feature = "simd")]
    pub fn simd_available(&self) -> bool {
        self.simd_metadata().simd_available()
    }

    /// Always `false`: the `simd` feature is disabled.
    #[cfg(not(feature = "simd"))]
    pub fn simd_available(&self) -> bool {
        false
    }
}
/// Fluent builder that collects the settings for a [`Dictionary`].
///
/// Every setter consumes the builder and returns it with one field replaced; calling a
/// setter again overwrites the earlier value.
#[derive(Debug, Default)]
pub struct DictionaryBuilder {
    /// Dictionary characters; an empty list is used at build time when unset.
    chars: Option<Vec<char>>,
    /// Encoding mode; `EncodingMode::Radix` is used at build time when unset.
    mode: Option<EncodingMode>,
    /// Optional padding character.
    padding: Option<char>,
    /// Optional first codepoint for byte-range dictionaries.
    start_codepoint: Option<u32>,
}

impl DictionaryBuilder {
    /// Creates a builder with nothing configured.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the dictionary characters.
    pub fn chars(self, chars: Vec<char>) -> Self {
        Self {
            chars: Some(chars),
            ..self
        }
    }

    /// Sets the dictionary characters to the characters of `s`, in order.
    pub fn chars_from_str(self, s: &str) -> Self {
        Self {
            chars: Some(s.chars().collect()),
            ..self
        }
    }

    /// Sets the encoding mode.
    pub fn mode(self, mode: EncodingMode) -> Self {
        Self {
            mode: Some(mode),
            ..self
        }
    }

    /// Sets the padding character.
    pub fn padding(self, padding: char) -> Self {
        Self {
            padding: Some(padding),
            ..self
        }
    }

    /// Sets the first codepoint of the byte-range window.
    pub fn start_codepoint(self, start_codepoint: u32) -> Self {
        Self {
            start_codepoint: Some(start_codepoint),
            ..self
        }
    }

    /// Hands the collected settings to `Dictionary::new_with_mode_and_range`.
    ///
    /// # Errors
    ///
    /// Returns whatever error message that constructor produces.
    #[allow(deprecated)]
    pub fn build(self) -> Result<Dictionary, String> {
        let Self {
            chars,
            mode,
            padding,
            start_codepoint,
        } = self;
        Dictionary::new_with_mode_and_range(
            chars.unwrap_or_default(),
            mode.unwrap_or(EncodingMode::Radix),
            padding,
            start_codepoint,
        )
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the dictionary and returns the error message; fails the test when the
    /// build unexpectedly succeeds.
    fn build_error(builder: DictionaryBuilder) -> String {
        builder.build().unwrap_err()
    }

    // ---- character validation ----

    #[test]
    fn test_duplicate_character_detection() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'c', 'a']));
        assert!(message.contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let message = build_error(Dictionary::builder().chars(Vec::new()));
        assert!(message.contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // Three characters: not a power of two.
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked);
        assert!(build_error(builder).contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        for size in [2u32, 4, 8, 16, 32, 64] {
            // Distinct characters: runs of 26 letters, each run shifted by 100 codepoints.
            let alphabet: Vec<char> = (0..size)
                .map(|i| {
                    let codepoint = 'A' as u32 + (i % 26) + ((i / 26) * 100);
                    char::from_u32(codepoint).unwrap()
                })
                .collect();
            let outcome = Dictionary::builder()
                .chars(alphabet)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(outcome.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_control_character_rejection() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', '\x00', 'c']));
        assert!(message.contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let with_tab = build_error(Dictionary::builder().chars(vec!['a', 'b', '\t', 'c']));
        assert!(with_tab.contains("Whitespace"));

        // A plain space is the one whitespace character that is accepted.
        let with_space = Dictionary::builder().chars(vec!['a', 'b', ' ', 'c']).build();
        assert!(with_space.is_ok());
    }

    // ---- padding ----

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('b');
        let message = build_error(builder);
        assert!(message.contains("Padding character"));
        assert!(message.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(outcome.is_ok());
    }

    // ---- byte-range mode ----

    #[test]
    fn test_byte_range_exceeds_unicode() {
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80)
            .build();
        assert!(outcome.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let message = build_error(Dictionary::builder().mode(EncodingMode::ByteRange));
        assert!(message.contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'a']));
        assert!(message.contains("'a'") || message.contains("U+"));
    }

    // ---- builder ----

    #[test]
    fn test_builder_basic() {
        let dictionary = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();
        assert_eq!(dictionary.base(), 4);
        assert_eq!(dictionary.mode(), &EncodingMode::Radix);
        assert_eq!(dictionary.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dictionary = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();
        assert_eq!(dictionary.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dictionary = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();
        assert_eq!(dictionary.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dictionary = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();
        assert_eq!(dictionary.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dictionary = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();
        assert_eq!(dictionary.mode(), &EncodingMode::ByteRange);
        assert_eq!(dictionary.start_codepoint(), Some(0x1F300));
        assert_eq!(dictionary.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let message = build_error(Dictionary::builder().mode(EncodingMode::ByteRange));
        assert!(message.contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'a']));
        assert!(message.contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        // Three characters: not a power of two.
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked);
        assert!(build_error(builder).contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b');
        assert!(build_error(builder).contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dictionary = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();
        assert_eq!(dictionary.base(), 2);
        assert_eq!(dictionary.mode(), &EncodingMode::Chunked);
        assert_eq!(dictionary.padding(), Some('='));
    }

    // ---- is_safe_byte_range boundaries ----

    #[test]
    fn test_is_safe_byte_range_nul() {
        assert!(!is_safe_byte_range(0));
    }

    #[test]
    fn test_is_safe_byte_range_end_of_c1() {
        assert!(!is_safe_byte_range(0x009F));
    }

    #[test]
    fn test_is_safe_byte_range_first_safe() {
        assert!(is_safe_byte_range(0x00A0));
    }

    #[test]
    fn test_is_safe_byte_range_just_below_surrogates() {
        // 0xD700 + 255 == 0xD7FF, the last codepoint before the surrogate block.
        assert!(is_safe_byte_range(0xD700));
    }

    #[test]
    fn test_is_safe_byte_range_overlaps_surrogate_start() {
        // 0xD701 + 255 == 0xD800, the first surrogate.
        assert!(!is_safe_byte_range(0xD701));
    }

    #[test]
    fn test_is_safe_byte_range_above_surrogates() {
        assert!(is_safe_byte_range(0xE000));
    }

    #[test]
    fn test_is_safe_byte_range_at_unicode_max() {
        // 0x10FF00 + 255 == 0x10FFFF, the last Unicode codepoint.
        assert!(is_safe_byte_range(0x10FF00));
    }

    #[test]
    fn test_is_safe_byte_range_exceeds_unicode_max() {
        assert!(!is_safe_byte_range(0x10FF01));
    }
}