use sha2::{Digest, Sha256};
use std::collections::HashMap;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DictionaryError {
TooSmall,
DuplicateSymbol(char),
UnknownSymbol(char),
IndexOutOfRange(u32),
}
pub struct Dictionary {
symbols: Vec<char>,
inverse: HashMap<char, u32>,
}
impl Dictionary {
pub fn new(symbols: Vec<char>) -> Result<Self, DictionaryError> {
if symbols.len() < 2 {
return Err(DictionaryError::TooSmall);
}
let mut inverse = HashMap::with_capacity(symbols.len());
for (idx, &sym) in symbols.iter().enumerate() {
if inverse.insert(sym, idx as u32).is_some() {
return Err(DictionaryError::DuplicateSymbol(sym));
}
}
Ok(Self { symbols, inverse })
}
pub fn base(&self) -> u32 {
self.symbols.len() as u32
}
pub fn index_to_symbol(&self, index: u32) -> Option<char> {
self.symbols.get(index as usize).copied()
}
pub fn symbol_to_index(&self, symbol: char) -> Option<u32> {
self.inverse.get(&symbol).copied()
}
pub fn fingerprint(&self) -> [u8; 8] {
let mut hasher = Sha256::new();
for &sym in &self.symbols {
hasher.update((sym as u32).to_be_bytes());
}
hasher.finalize()[0..8]
.try_into()
.expect("SHA-256 produce >= 8 bytes")
}
pub fn encode(&self, indices: &[u32]) -> Result<String, DictionaryError> {
let mut out = String::with_capacity(indices.len());
for &index in indices {
let sym = self
.index_to_symbol(index)
.ok_or(DictionaryError::IndexOutOfRange(index))?;
out.push(sym);
}
Ok(out)
}
pub fn decode(&self, text: &str) -> Result<Vec<u32>, DictionaryError> {
text.chars()
.map(|sym| {
self.symbol_to_index(sym)
.ok_or(DictionaryError::UnknownSymbol(sym))
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use proptest::prelude::*;
#[test]
fn maps_indices_to_symbols_and_back() {
let dict = Dictionary::new(vec!['A', 'B', 'C', 'D']).unwrap();
let symbols = dict.encode(&[0, 1, 2, 3, 2, 1, 0]).unwrap();
assert_eq!(symbols, "ABCDCBA");
let indices = dict.decode("ABCDCBA").unwrap();
assert_eq!(indices, vec![0, 1, 2, 3, 2, 1, 0]);
}
#[test]
fn rejects_alphabet_smaller_than_two() {
assert!(matches!(
Dictionary::new(vec!['A']),
Err(DictionaryError::TooSmall)
));
}
#[test]
fn rejects_duplicate_symbol() {
assert!(matches!(
Dictionary::new(vec!['A', 'B', 'A']),
Err(DictionaryError::DuplicateSymbol('A'))
));
}
#[test]
fn decode_rejects_unknown_symbol() {
let dict = Dictionary::new(vec!['A', 'B']).unwrap();
assert_eq!(dict.decode("AZ"), Err(DictionaryError::UnknownSymbol('Z')));
}
#[test]
fn encode_rejects_index_out_of_range() {
let dict = Dictionary::new(vec!['A', 'B']).unwrap();
assert_eq!(
dict.encode(&[0, 2]),
Err(DictionaryError::IndexOutOfRange(2))
);
}
#[test]
fn fingerprint_is_stable_and_distinguishes_alphabets() {
let d1 = Dictionary::new(vec!['A', 'B', 'C']).unwrap();
let d1_again = Dictionary::new(vec!['A', 'B', 'C']).unwrap();
let d2 = Dictionary::new(vec!['A', 'B', 'D']).unwrap();
assert_eq!(d1.fingerprint(), d1_again.fingerprint());
assert_ne!(d1.fingerprint(), d2.fingerprint());
}
proptest! {
#[test]
fn codec_plus_dictionary_round_trip(
data in proptest::collection::vec(any::<u8>(), 0..200),
) {
let symbols: Vec<char> = (0x21u8..=0x7e).map(|b| b as char).collect();
let dict = Dictionary::new(symbols).unwrap();
let n = dict.base();
let indices = crate::codec::encode_base_n(&data, n);
let text = dict.encode(&indices).unwrap();
let back_indices = dict.decode(&text).unwrap();
let back = crate::codec::decode_base_n(&back_indices, n);
prop_assert_eq!(back, data);
}
}
}