// epsilon_engine/symbolize.rs

/// Reasons a symbolization routine in this module can reject its input.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum SymbolizeError {
    /// The input held no samples (or, for text input, no words).
    EmptyInput,
    /// Fewer than two symbols were requested.
    TooFewSymbols,
    /// The input values do not span a usable range, so no bin edges exist.
    ConstantInput,
}

37impl std::fmt::Display for SymbolizeError {
38 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39 match self {
40 Self::EmptyInput => write!(f, "input data is empty"),
41 Self::TooFewSymbols => write!(f, "num_symbols must be ≥ 2"),
42 Self::ConstantInput => write!(f, "all values are identical; cannot bin"),
43 }
44 }
45}
46
47impl std::error::Error for SymbolizeError {}
48
49pub fn equal_width(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
61 if data.is_empty() {
62 return Err(SymbolizeError::EmptyInput);
63 }
64 if num_symbols < 2 {
65 return Err(SymbolizeError::TooFewSymbols);
66 }
67
68 let min = data.iter().copied().fold(f64::INFINITY, f64::min);
69 let max = data.iter().copied().fold(f64::NEG_INFINITY, f64::max);
70
71 if (max - min).abs() < f64::EPSILON {
72 return Err(SymbolizeError::ConstantInput);
73 }
74
75 let width = (max - min) / num_symbols as f64;
76 let n_sym = num_symbols as u8;
77
78 let symbols = data
79 .iter()
80 .map(|&v| {
81 let bin = ((v - min) / width).floor() as u8;
82 bin.min(n_sym - 1) })
84 .collect();
85
86 Ok(symbols)
87}
88
89pub fn equal_frequency(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
104 if data.is_empty() {
105 return Err(SymbolizeError::EmptyInput);
106 }
107 if num_symbols < 2 {
108 return Err(SymbolizeError::TooFewSymbols);
109 }
110
111 let mut sorted: Vec<f64> = data.to_vec();
113 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
114
115 let first = sorted[0];
116 let last = sorted[sorted.len() - 1];
117 if (last - first).abs() < f64::EPSILON {
118 return Err(SymbolizeError::ConstantInput);
119 }
120
121 let n = sorted.len();
124 let cuts: Vec<f64> = (1..num_symbols)
125 .map(|k| {
126 let idx = (k * n / num_symbols).min(n - 1);
127 sorted[idx]
128 })
129 .collect();
130
131 let symbols = data
132 .iter()
133 .map(|&v| {
134 let sym = cuts.partition_point(|&cut| v >= cut) as u8;
136 sym.min(num_symbols as u8 - 1)
137 })
138 .collect();
139
140 Ok(symbols)
141}
142
/// Counts how many distinct symbol values occur in `symbols`.
///
/// Returns `0` for an empty slice and at most `256`, the number of values a
/// `u8` can take.
#[must_use]
pub fn alphabet_size(symbols: &[u8]) -> usize {
    // One presence flag per possible `u8` value.
    let mut seen = [false; 256];
    for &symbol in symbols {
        seen[usize::from(symbol)] = true;
    }
    seen.iter().filter(|&&present| present).count()
}

/// Unit tests for the numeric symbolizers and `alphabet_size`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn equal_width_basic() {
        let data = vec![0.0, 1.0, 2.0, 3.0];
        let syms = equal_width(&data, 4).unwrap();
        assert_eq!(syms, vec![0, 1, 2, 3]);
    }

    #[test]
    fn equal_width_clamps_max() {
        // 1.0 sits exactly on the upper edge and must not spill into bin 2.
        let data = vec![0.0, 0.5, 1.0];
        let syms = equal_width(&data, 2).unwrap();
        assert_eq!(syms[2], 1, "max value must map to last bin");
    }

    #[test]
    fn equal_frequency_distributes_evenly() {
        let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
        let syms = equal_frequency(&data, 4).unwrap();
        let mut counts = [0usize; 4];
        for &s in &syms {
            counts[s as usize] += 1;
        }
        for c in counts {
            assert!(
                (23..=27).contains(&c),
                "bin count {c} out of expected range"
            );
        }
    }

    #[test]
    fn equal_width_empty_error() {
        assert_eq!(equal_width(&[], 4), Err(SymbolizeError::EmptyInput));
    }

    #[test]
    fn equal_width_few_symbols_error() {
        assert_eq!(
            equal_width(&[1.0, 2.0], 1),
            Err(SymbolizeError::TooFewSymbols)
        );
    }

    #[test]
    fn equal_width_constant_error() {
        assert_eq!(
            equal_width(&[5.0, 5.0, 5.0], 4),
            Err(SymbolizeError::ConstantInput)
        );
    }

    #[test]
    fn equal_frequency_empty_error() {
        assert_eq!(equal_frequency(&[], 4), Err(SymbolizeError::EmptyInput));
    }

    #[test]
    fn equal_frequency_few_symbols_error() {
        assert_eq!(
            equal_frequency(&[1.0, 2.0], 1),
            Err(SymbolizeError::TooFewSymbols)
        );
    }

    #[test]
    fn equal_frequency_constant_error() {
        assert_eq!(
            equal_frequency(&[3.0, 3.0], 2),
            Err(SymbolizeError::ConstantInput)
        );
    }

    #[test]
    fn alphabet_size_counts_distinct() {
        let syms = vec![0u8, 1, 2, 1, 0, 3];
        assert_eq!(alphabet_size(&syms), 4);
    }

    #[test]
    fn alphabet_size_single_symbol() {
        let syms = vec![7u8; 100];
        assert_eq!(alphabet_size(&syms), 1);
    }

    #[test]
    fn alphabet_size_empty_is_zero() {
        assert_eq!(alphabet_size(&[]), 0);
    }
}

/// Turns whitespace-separated text into a symbol stream by binning every word
/// according to how often that word occurs in the text.
#[derive(Debug, Clone)]
pub struct WordSymbolizer {
    /// Requested alphabet size; `WordSymbolizer::new` rejects values below 2.
    num_symbols: usize,
}

265impl WordSymbolizer {
266 #[must_use]
272 pub fn new(num_symbols: usize) -> Self {
273 assert!(num_symbols >= 2, "num_symbols must be ≥ 2");
274 Self { num_symbols }
275 }
276
277 pub fn symbolize(&self, text: &str) -> Result<Vec<u8>, SymbolizeError> {
290 let words: Vec<&str> = text.split_whitespace().collect();
292 if words.is_empty() {
293 return Err(SymbolizeError::EmptyInput);
294 }
295
296 let mut freq_map = std::collections::HashMap::new();
298 for &word in &words {
299 *freq_map.entry(word).or_insert(0usize) += 1;
300 }
301
302 let mut freqs: Vec<usize> = freq_map.values().copied().collect();
304 freqs.sort_unstable();
305
306 let n = freqs.len();
308 let cuts: Vec<usize> = (1..self.num_symbols)
309 .map(|k| {
310 let idx = (k * n / self.num_symbols).min(n - 1);
311 freqs[idx]
312 })
313 .collect();
314
315 let symbols: Vec<u8> = words
317 .iter()
318 .map(|&word| {
319 let freq = freq_map[word];
320 let sym = cuts.partition_point(|&cut| freq >= cut) as u8;
321 sym.min(self.num_symbols as u8 - 1)
322 })
323 .collect();
324
325 Ok(symbols)
326 }
327}
328
/// Unit tests for `WordSymbolizer`.
#[cfg(test)]
mod word_tests {
    use super::*;

    #[test]
    fn word_symbolizer_basic() {
        let text = "the quick brown fox jumps over the lazy dog the fox";
        let symbolizer = WordSymbolizer::new(4);
        let symbols = symbolizer.symbolize(text).unwrap();

        // One symbol per word.
        assert_eq!(symbols.len(), 11);

        let alpha_size = alphabet_size(&symbols);
        assert!((2..=4).contains(&alpha_size));
    }

    #[test]
    fn word_symbolizer_repetitive_text() {
        // Every word occurs exactly twice, so all counts fall in one bin.
        let text = "home about blog contact home about blog contact";
        let symbolizer = WordSymbolizer::new(4);
        let symbols = symbolizer.symbolize(text).unwrap();

        let alpha_size = alphabet_size(&symbols);
        assert_eq!(alpha_size, 1, "repetitive text should collapse to single symbol");
    }

    #[test]
    fn word_symbolizer_empty_error() {
        let symbolizer = WordSymbolizer::new(4);
        assert_eq!(symbolizer.symbolize(""), Err(SymbolizeError::EmptyInput));
        assert_eq!(symbolizer.symbolize(" "), Err(SymbolizeError::EmptyInput));
    }

    #[test]
    fn word_symbolizer_single_word() {
        let symbolizer = WordSymbolizer::new(4);
        let symbols = symbolizer.symbolize("hello").unwrap();
        assert_eq!(symbols.len(), 1);
        assert_eq!(alphabet_size(&symbols), 1);
    }

    #[test]
    fn word_symbolizer_diverse_text() {
        // Every word occurs exactly once, so all counts fall in one bin.
        let text = "alpha beta gamma delta epsilon zeta eta theta";
        let symbolizer = WordSymbolizer::new(4);
        let symbols = symbolizer.symbolize(text).unwrap();

        let alpha_size = alphabet_size(&symbols);
        assert_eq!(alpha_size, 1, "uniform frequency should collapse to single symbol");
    }

    #[test]
    #[should_panic(expected = "num_symbols must be ≥ 2")]
    fn word_symbolizer_invalid_num_symbols() {
        let _ = WordSymbolizer::new(1);
    }
}