/// Reasons the symbolization routines in this file refuse an input.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum SymbolizeError {
/// The input contained nothing to symbolize (no values, or no words).
EmptyInput,
/// Fewer than two symbols were requested.
TooFewSymbols,
/// The smallest and largest input values coincide, so no bins can be formed.
ConstantInput,
}
// User-facing text for every `SymbolizeError` variant.
impl std::fmt::Display for SymbolizeError {
    /// Writes the fixed, human-readable description of this error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the message first, then emit it with a single write.
        let message = match self {
            Self::EmptyInput => "input data is empty",
            Self::TooFewSymbols => "num_symbols must be ≥ 2",
            Self::ConstantInput => "all values are identical; cannot bin",
        };
        f.write_str(message)
    }
}
// Marker impl: lets `SymbolizeError` be used wherever a `std::error::Error` is
// expected. No trait methods are overridden, so the defaults apply.
impl std::error::Error for SymbolizeError {}
/// Discretizes `data` into equal-width bins spanning `[min, max]` of the input.
///
/// Every value is replaced by the zero-based index of the bin it falls into.
/// The maximum value, which would otherwise land one past the end, is clamped
/// into the last bin.
///
/// Symbols are `u8`, so at most 256 bins can be represented. A `num_symbols`
/// larger than 256 is treated as 256 instead of being silently truncated.
///
/// # Errors
///
/// * [`SymbolizeError::EmptyInput`] if `data` is empty.
/// * [`SymbolizeError::TooFewSymbols`] if `num_symbols < 2`.
/// * [`SymbolizeError::ConstantInput`] if `max - min` is below `f64::EPSILON`.
pub fn equal_width(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
    /// Largest alphabet a `u8` symbol can address.
    const MAX_BINS: usize = 256;

    if data.is_empty() {
        return Err(SymbolizeError::EmptyInput);
    }
    if num_symbols < 2 {
        return Err(SymbolizeError::TooFewSymbols);
    }

    let min = data.iter().copied().fold(f64::INFINITY, f64::min);
    let max = data.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    if (max - min).abs() < f64::EPSILON {
        return Err(SymbolizeError::ConstantInput);
    }

    // Cap before any narrowing cast: `256 as u8` is 0 and `257 as u8` is 1,
    // which previously caused an underflow or nonsensical bins.
    let bins = num_symbols.min(MAX_BINS);
    // Lossless: `bins - 1` is in 1..=255 here.
    let last_bin = (bins - 1) as u8;
    let width = (max - min) / bins as f64;

    let symbols = data
        .iter()
        .map(|&v| {
            // Float-to-int `as` saturates (and maps NaN to 0), so the cast
            // itself cannot wrap; `min` then folds the top edge into the last bin.
            let bin = ((v - min) / width).floor() as u8;
            bin.min(last_bin)
        })
        .collect();
    Ok(symbols)
}
/// Discretizes `data` into bins that each hold roughly the same number of values.
///
/// Cut points are read from a sorted copy of the input at evenly spaced ranks.
/// A value's symbol is the number of cut points that are less than or equal
/// to it.
///
/// Symbols are `u8`, so at most 256 bins can be represented. A `num_symbols`
/// larger than 256 is treated as 256 instead of being silently truncated.
///
/// NaN values get no special treatment. They are ordered with
/// [`f64::total_cmp`] so that sorting is well defined.
///
/// # Errors
///
/// * [`SymbolizeError::EmptyInput`] if `data` is empty.
/// * [`SymbolizeError::TooFewSymbols`] if `num_symbols < 2`.
/// * [`SymbolizeError::ConstantInput`] if the sorted extremes differ by less
///   than `f64::EPSILON`.
pub fn equal_frequency(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
    /// Largest alphabet a `u8` symbol can address.
    const MAX_BINS: usize = 256;

    if data.is_empty() {
        return Err(SymbolizeError::EmptyInput);
    }
    if num_symbols < 2 {
        return Err(SymbolizeError::TooFewSymbols);
    }

    let mut sorted: Vec<f64> = data.to_vec();
    // `total_cmp` is a genuine total order. The `partial_cmp` + `Equal`
    // fallback is not when NaN is present, and `sort_by` may panic on a
    // comparator that is not a total order.
    sorted.sort_by(f64::total_cmp);

    let n = sorted.len();
    let first = sorted[0];
    let last = sorted[n - 1];
    if (last - first).abs() < f64::EPSILON {
        return Err(SymbolizeError::ConstantInput);
    }

    // Cap before narrowing: `256 as u8 - 1` underflows and larger values wrap.
    let bins = num_symbols.min(MAX_BINS);
    let cuts: Vec<f64> = (1..bins)
        .map(|k| sorted[(k * n / bins).min(n - 1)])
        .collect();

    let symbols = data
        .iter()
        .map(|&v| {
            // At most `bins - 1 <= 255` cuts exist, so the count fits in a u8
            // and is already no larger than the last bin index.
            cuts.partition_point(|&cut| v >= cut) as u8
        })
        .collect();
    Ok(symbols)
}
/// Returns how many distinct byte values appear in `symbols`.
#[must_use]
pub fn alphabet_size(symbols: &[u8]) -> usize {
    // One flag per possible `u8` value.
    let mut present = [false; 256];
    symbols
        .iter()
        // `replace` returns the previous flag, so only the first sighting of
        // each value passes the filter.
        .filter(|&&sym| !std::mem::replace(&mut present[usize::from(sym)], true))
        .count()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Four evenly spaced values fall into four consecutive bins.
    #[test]
    fn equal_width_basic() {
        let symbols = equal_width(&[0.0, 1.0, 2.0, 3.0], 4).unwrap();
        assert_eq!(symbols, [0, 1, 2, 3]);
    }

    /// The maximum must be folded into the last bin, not one past it.
    #[test]
    fn equal_width_clamps_max() {
        let symbols = equal_width(&[0.0, 0.5, 1.0], 2).unwrap();
        let top = symbols[2];
        assert_eq!(top, 1, "max value must map to last bin");
    }

    /// A uniform ramp should give roughly equal bin populations.
    #[test]
    fn equal_frequency_distributes_evenly() {
        let ramp: Vec<f64> = (0..100).map(f64::from).collect();
        let mut histogram = [0usize; 4];
        equal_frequency(&ramp, 4)
            .unwrap()
            .into_iter()
            .for_each(|sym| histogram[usize::from(sym)] += 1);
        for c in histogram {
            assert!(
                (23..=27).contains(&c),
                "bin count {c} out of expected range"
            );
        }
    }

    #[test]
    fn equal_width_empty_error() {
        let outcome = equal_width(&[], 4);
        assert_eq!(outcome, Err(SymbolizeError::EmptyInput));
    }

    #[test]
    fn equal_width_few_symbols_error() {
        let outcome = equal_width(&[1.0, 2.0], 1);
        assert_eq!(outcome, Err(SymbolizeError::TooFewSymbols));
    }

    #[test]
    fn equal_width_constant_error() {
        let outcome = equal_width(&[5.0; 3], 4);
        assert_eq!(outcome, Err(SymbolizeError::ConstantInput));
    }

    #[test]
    fn equal_frequency_constant_error() {
        let outcome = equal_frequency(&[3.0; 2], 2);
        assert_eq!(outcome, Err(SymbolizeError::ConstantInput));
    }

    #[test]
    fn alphabet_size_counts_distinct() {
        let distinct = alphabet_size(&[0u8, 1, 2, 1, 0, 3]);
        assert_eq!(distinct, 4);
    }

    #[test]
    fn alphabet_size_single_symbol() {
        let distinct = alphabet_size(&[7u8; 100]);
        assert_eq!(distinct, 1);
    }
}
/// Turns the words of a text into symbols based on how often each word occurs
/// in that text.
#[derive(Debug, Clone)]
pub struct WordSymbolizer {
// Number of frequency classes requested at construction; `new` enforces ≥ 2.
num_symbols: usize,
}
impl WordSymbolizer {
    /// Creates a symbolizer that sorts words into `num_symbols` frequency classes.
    ///
    /// # Panics
    ///
    /// Panics if `num_symbols < 2`.
    #[must_use]
    pub fn new(num_symbols: usize) -> Self {
        assert!(num_symbols >= 2, "num_symbols must be ≥ 2");
        Self { num_symbols }
    }

    /// Maps each whitespace-separated word of `text` to a frequency-class symbol.
    ///
    /// Each distinct word's occurrence count is computed. Cut points are then
    /// read from the sorted list of those counts at evenly spaced ranks. A
    /// word's symbol is the number of cut points that are less than or equal
    /// to its count. The output has one symbol per word, in text order.
    ///
    /// Symbols are `u8`, so at most 256 classes can be represented. A
    /// configured `num_symbols` above 256 is treated as 256 instead of being
    /// silently truncated.
    ///
    /// # Errors
    ///
    /// Returns [`SymbolizeError::EmptyInput`] if `text` contains no words.
    pub fn symbolize(&self, text: &str) -> Result<Vec<u8>, SymbolizeError> {
        /// Largest alphabet a `u8` symbol can address.
        const MAX_BINS: usize = 256;

        let words: Vec<&str> = text.split_whitespace().collect();
        if words.is_empty() {
            return Err(SymbolizeError::EmptyInput);
        }

        let mut freq_map = std::collections::HashMap::new();
        for &word in &words {
            *freq_map.entry(word).or_insert(0usize) += 1;
        }

        let mut freqs: Vec<usize> = freq_map.values().copied().collect();
        freqs.sort_unstable();
        let n = freqs.len();

        // Cap before narrowing: `256 as u8 - 1` underflows and larger values wrap.
        let bins = self.num_symbols.min(MAX_BINS);
        let cuts: Vec<usize> = (1..bins)
            .map(|k| freqs[(k * n / bins).min(n - 1)])
            .collect();

        let symbols: Vec<u8> = words
            .iter()
            .map(|&word| {
                let freq = freq_map[word];
                // At most `bins - 1 <= 255` cuts exist, so the count fits in
                // a u8 and is already no larger than the last class index.
                cuts.partition_point(|&cut| freq >= cut) as u8
            })
            .collect();
        Ok(symbols)
    }
}
#[cfg(test)]
mod word_tests {
    use super::*;

    /// Mixed-frequency text yields one symbol per word and several classes.
    #[test]
    fn word_symbolizer_basic() {
        let encoded = WordSymbolizer::new(4)
            .symbolize("the quick brown fox jumps over the lazy dog the fox")
            .unwrap();
        assert_eq!(encoded.len(), 11);
        let distinct = alphabet_size(&encoded);
        assert!((2..=4).contains(&distinct));
    }

    /// Every word occurs equally often, so all words share one class.
    #[test]
    fn word_symbolizer_repetitive_text() {
        let encoded = WordSymbolizer::new(4)
            .symbolize("home about blog contact home about blog contact")
            .unwrap();
        assert_eq!(
            alphabet_size(&encoded),
            1,
            "repetitive text should collapse to single symbol"
        );
    }

    /// Text without any words is rejected.
    #[test]
    fn word_symbolizer_empty_error() {
        let symbolizer = WordSymbolizer::new(4);
        for blank in ["", " "] {
            assert_eq!(symbolizer.symbolize(blank), Err(SymbolizeError::EmptyInput));
        }
    }

    #[test]
    fn word_symbolizer_single_word() {
        let encoded = WordSymbolizer::new(4).symbolize("hello").unwrap();
        assert_eq!(encoded.len(), 1);
        assert_eq!(alphabet_size(&encoded), 1);
    }

    /// All-unique words also have uniform frequency and share one class.
    #[test]
    fn word_symbolizer_diverse_text() {
        let encoded = WordSymbolizer::new(4)
            .symbolize("alpha beta gamma delta epsilon zeta eta theta")
            .unwrap();
        assert_eq!(
            alphabet_size(&encoded),
            1,
            "uniform frequency should collapse to single symbol"
        );
    }

    #[test]
    #[should_panic(expected = "num_symbols must be ≥ 2")]
    fn word_symbolizer_invalid_num_symbols() {
        let _ = WordSymbolizer::new(1);
    }
}