use std::collections::BTreeMap;
/// Dictionary encoding of a column of strings.
///
/// Every distinct input string is assigned a `u32` code equal to its rank in
/// ascending `String` order, and the column itself is stored as the sequence
/// of those codes.
#[derive(Debug, Clone)]
pub struct DictEncoding {
    /// Maps each distinct value to its code.
    dict: BTreeMap<String, u32>,
    /// Distinct values in ascending order; the value with code `c` is `reverse[c]`.
    reverse: Vec<String>,
    /// One code per input element, in input order.
    codes: Vec<u32>,
}

impl DictEncoding {
    /// Builds the encoding for `data`.
    ///
    /// Codes are dense (`0..cardinality`) and follow the sorted order of the
    /// distinct values, so comparing two codes gives the same result as
    /// comparing the strings they stand for.
    ///
    /// # Panics
    ///
    /// Panics if `data` contains more distinct values than fit in the `u32`
    /// code space.
    pub fn encode(data: &[String]) -> Self {
        // Deduplicate on borrowed keys so that repeated values cost no
        // allocation; the `BTreeMap` also keeps the distinct values sorted.
        let mut ranks: BTreeMap<&str, u32> = BTreeMap::new();
        for value in data {
            ranks.entry(value.as_str()).or_insert(0);
        }

        // Iteration is in ascending key order, so the position is the code.
        // The conversion is checked because a plain `as` cast would wrap
        // silently and hand the same code to two different values. It is
        // spelled fully qualified because `TryFrom` is only in the prelude
        // from the 2021 edition onwards.
        for (rank, code) in ranks.values_mut().enumerate() {
            *code = <u32 as std::convert::TryFrom<usize>>::try_from(rank)
                .expect("DictEncoding: number of distinct values exceeds the u32 code space");
        }

        let reverse: Vec<String> = ranks.keys().map(|key| String::from(*key)).collect();
        let codes: Vec<u32> = data.iter().map(|value| ranks[value.as_str()]).collect();
        let dict: BTreeMap<String, u32> = ranks
            .into_iter()
            .map(|(key, code)| (String::from(key), code))
            .collect();

        DictEncoding {
            dict,
            reverse,
            codes,
        }
    }

    /// Reconstructs the original column by mapping every stored code back to
    /// its string.
    pub fn decode(&self) -> Vec<String> {
        self.codes
            .iter()
            // Every stored code was produced by `encode` as an index into
            // `reverse`, so the lookup is always in bounds.
            .map(|&code| self.reverse[code as usize].clone())
            .collect()
    }

    /// Returns the code assigned to `value`, or `None` if `value` did not
    /// occur in the encoded data.
    pub fn lookup(&self, value: &str) -> Option<u32> {
        self.dict.get(value).copied()
    }

    /// Number of distinct values in the dictionary.
    pub fn cardinality(&self) -> usize {
        self.reverse.len()
    }

    /// The encoded column: one code per input element, in input order.
    pub fn codes(&self) -> &[u32] {
        &self.codes
    }

    /// The value-to-code dictionary.
    pub fn dict(&self) -> &BTreeMap<String, u32> {
        &self.dict
    }

    /// The code-to-value table: distinct values in ascending order, indexed by code.
    pub fn reverse(&self) -> &[String] {
        &self.reverse
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a list of string literals into the owned column `encode` expects.
    fn column(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Decoding an encoded column reproduces the input, duplicates included.
    #[test]
    fn test_roundtrip() {
        let input = column(&["banana", "apple", "cherry", "apple", "banana"]);
        let encoded = DictEncoding::encode(&input);
        assert_eq!(encoded.decode(), input);
    }

    /// Codes follow the sorted order of the distinct values, not input order.
    #[test]
    fn test_sorted_codes() {
        let input = column(&["cherry", "apple", "banana"]);
        let encoded = DictEncoding::encode(&input);

        let expected = [("apple", 0u32), ("banana", 1), ("cherry", 2)];
        for &(word, code) in expected.iter() {
            assert_eq!(encoded.lookup(word), Some(code));
        }
        assert_eq!(encoded.cardinality(), 3);
    }

    /// An empty column yields an empty dictionary and no codes.
    #[test]
    fn test_empty() {
        let no_rows: [String; 0] = [];
        let encoded = DictEncoding::encode(&no_rows);
        assert_eq!(encoded.cardinality(), 0);
        assert!(encoded.codes().is_empty());
        assert!(encoded.decode().is_empty());
    }
}