Skip to main content

epsilon_engine/
symbolize.rs

1//! Pillar: III. PACR field: Γ (symbol alphabet for ε-machine construction).
2//!
3//! Converts continuous time-series data or text into discrete symbol sequences.
4//! Three strategies are provided:
5//!
6//! | Strategy        | Input type | Binning method          | Use case                        |
7//! |-----------------|------------|-------------------------|---------------------------------|
8//! | `EqualWidth`    | f64 stream | Uniform value intervals | Known bounded range             |
9//! | `EqualFrequency`| f64 stream | Uniform sample quantiles| Unknown distribution (default)  |
10//! | `WordSymbolizer`| Text       | Word-level frequency    | Semantic redundancy detection   |
11//!
12//! # Alphabet size
13//!
14//! The alphabet size `|A|` trades bias for variance:
15//! - Small `|A|` (2–4): under-resolves structure, misses fine-grained patterns.
16//! - Large `|A|` (8–16): more states but requires larger N for statistical power.
17//! - Recommended: `|A|` = 4–8 for N < 50 000 samples.
18//!
19//! # WordSymbolizer
20//!
21//! For text input, `WordSymbolizer` tokenizes on whitespace and bins words by frequency.
22//! This detects semantic redundancy (e.g., "Home About Blog Contact" → low S_T)
23//! that byte-level symbolizers miss.
24
25/// Error type for symbolization failures.
26#[derive(Debug, Clone, PartialEq, Eq)]
27#[non_exhaustive]
28pub enum SymbolizeError {
29    /// The input data slice is empty.
30    EmptyInput,
31    /// `num_symbols` must be ≥ 2.
32    TooFewSymbols,
33    /// All values are identical — cannot bin with equal-frequency.
34    ConstantInput,
35}
36
37impl std::fmt::Display for SymbolizeError {
38    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
39        match self {
40            Self::EmptyInput => write!(f, "input data is empty"),
41            Self::TooFewSymbols => write!(f, "num_symbols must be ≥ 2"),
42            Self::ConstantInput => write!(f, "all values are identical; cannot bin"),
43        }
44    }
45}
46
47impl std::error::Error for SymbolizeError {}
48
49// ── Equal-Width Quantization ──────────────────────────────────────────────────
50
51/// Quantize `data` into `num_symbols` equal-width bins.
52///
53/// Bin boundaries are `[min, min + w, min + 2w, ...]` where `w = (max - min) / n`.
54/// Symbols are `0 ..= num_symbols - 1`.
55///
56/// # Errors
57///
58/// Returns [`SymbolizeError`] if `data` is empty, `num_symbols < 2`,
59/// or all values are identical (zero-width bins).
60pub fn equal_width(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
61    if data.is_empty() {
62        return Err(SymbolizeError::EmptyInput);
63    }
64    if num_symbols < 2 {
65        return Err(SymbolizeError::TooFewSymbols);
66    }
67
68    let min = data.iter().copied().fold(f64::INFINITY, f64::min);
69    let max = data.iter().copied().fold(f64::NEG_INFINITY, f64::max);
70
71    if (max - min).abs() < f64::EPSILON {
72        return Err(SymbolizeError::ConstantInput);
73    }
74
75    let width = (max - min) / num_symbols as f64;
76    let n_sym = num_symbols as u8;
77
78    let symbols = data
79        .iter()
80        .map(|&v| {
81            let bin = ((v - min) / width).floor() as u8;
82            bin.min(n_sym - 1) // clamp the max value into the last bin
83        })
84        .collect();
85
86    Ok(symbols)
87}
88
89// ── Equal-Frequency Quantization ─────────────────────────────────────────────
90
91/// Quantize `data` into `num_symbols` equal-frequency (quantile) bins.
92///
93/// Bin boundaries are chosen so each bin contains approximately the same
94/// number of samples.  Symbol `k` maps to the `k`-th quantile interval.
95///
96/// This is the recommended strategy for CSSR: it maximises entropy in the
97/// symbolic sequence, giving the suffix tree more discriminating power.
98///
99/// # Errors
100///
101/// Returns [`SymbolizeError`] if `data` is empty, `num_symbols < 2`,
102/// or all values are identical.
103pub fn equal_frequency(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
104    if data.is_empty() {
105        return Err(SymbolizeError::EmptyInput);
106    }
107    if num_symbols < 2 {
108        return Err(SymbolizeError::TooFewSymbols);
109    }
110
111    // Sort indices to find quantile boundaries.
112    let mut sorted: Vec<f64> = data.to_vec();
113    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
114
115    let first = sorted[0];
116    let last = sorted[sorted.len() - 1];
117    if (last - first).abs() < f64::EPSILON {
118        return Err(SymbolizeError::ConstantInput);
119    }
120
121    // Compute `num_symbols - 1` cut-points at quantile boundaries.
122    // Cut k: sorted[floor(k * N / num_symbols)] for k = 1..num_symbols-1.
123    let n = sorted.len();
124    let cuts: Vec<f64> = (1..num_symbols)
125        .map(|k| {
126            let idx = (k * n / num_symbols).min(n - 1);
127            sorted[idx]
128        })
129        .collect();
130
131    let symbols = data
132        .iter()
133        .map(|&v| {
134            // Binary search for the first cut > v → symbol index.
135            let sym = cuts.partition_point(|&cut| v >= cut) as u8;
136            sym.min(num_symbols as u8 - 1)
137        })
138        .collect();
139
140    Ok(symbols)
141}
142
143// ── Utility: alphabet size inference ──────────────────────────────────────────
144
145/// Returns the number of distinct symbols actually used in the sequence.
146///
147/// This may be less than the requested `num_symbols` when the data range
148/// is narrower than expected.
149#[must_use]
150pub fn alphabet_size(symbols: &[u8]) -> usize {
151    let mut seen = [false; 256];
152    for &s in symbols {
153        seen[s as usize] = true;
154    }
155    seen.iter().filter(|&&b| b).count()
156}
157
158// ── Tests ─────────────────────────────────────────────────────────────────────
159
160#[cfg(test)]
161mod tests {
162    use super::*;
163
164    #[test]
165    fn equal_width_basic() {
166        let data = vec![0.0, 1.0, 2.0, 3.0];
167        let syms = equal_width(&data, 4).unwrap();
168        // Each value maps to its own bin.
169        assert_eq!(syms, vec![0, 1, 2, 3]);
170    }
171
172    #[test]
173    fn equal_width_clamps_max() {
174        // Exactly max value must map to last bin, not overflow.
175        let data = vec![0.0, 0.5, 1.0];
176        let syms = equal_width(&data, 2).unwrap();
177        // [0.0, 0.5) → 0; [0.5, 1.0] → 1 (clamped)
178        assert_eq!(syms[2], 1, "max value must map to last bin");
179    }
180
181    #[test]
182    fn equal_frequency_distributes_evenly() {
183        // 100 values uniform → 4 bins each with ~25 elements.
184        let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
185        let syms = equal_frequency(&data, 4).unwrap();
186        let mut counts = [0usize; 4];
187        for &s in &syms {
188            counts[s as usize] += 1;
189        }
190        // Each bin should have ~25; allow ±2 slack.
191        for c in counts {
192            assert!(
193                (23..=27).contains(&c),
194                "bin count {c} out of expected range"
195            );
196        }
197    }
198
199    #[test]
200    fn equal_width_empty_error() {
201        assert_eq!(equal_width(&[], 4), Err(SymbolizeError::EmptyInput));
202    }
203
204    #[test]
205    fn equal_width_few_symbols_error() {
206        assert_eq!(
207            equal_width(&[1.0, 2.0], 1),
208            Err(SymbolizeError::TooFewSymbols)
209        );
210    }
211
212    #[test]
213    fn equal_width_constant_error() {
214        assert_eq!(
215            equal_width(&[5.0, 5.0, 5.0], 4),
216            Err(SymbolizeError::ConstantInput)
217        );
218    }
219
220    #[test]
221    fn equal_frequency_constant_error() {
222        assert_eq!(
223            equal_frequency(&[3.0, 3.0], 2),
224            Err(SymbolizeError::ConstantInput)
225        );
226    }
227
228    #[test]
229    fn alphabet_size_counts_distinct() {
230        let syms = vec![0u8, 1, 2, 1, 0, 3];
231        assert_eq!(alphabet_size(&syms), 4);
232    }
233
234    #[test]
235    fn alphabet_size_single_symbol() {
236        let syms = vec![7u8; 100];
237        assert_eq!(alphabet_size(&syms), 1);
238    }
239}
240
241// ── Word-Level Symbolizer ─────────────────────────────────────────────────────
242
243/// Symbolizer for text input using word-level frequency binning.
244///
245/// Tokenizes on whitespace, computes word frequencies, and bins words into
246/// `num_symbols` frequency tiers. This detects semantic redundancy that
247/// byte-level symbolizers miss (e.g., "Home About Blog Contact" has low S_T).
248///
249/// # Example
250///
251/// ```
252/// use epsilon_engine::symbolize::WordSymbolizer;
253///
254/// let text = "the quick brown fox jumps over the lazy dog the fox";
255/// let symbolizer = WordSymbolizer::new(4);
256/// let symbols = symbolizer.symbolize(text).unwrap();
257/// // "the" (3×) and "fox" (2×) map to high-frequency bins
258/// // "quick", "brown", etc. (1×) map to low-frequency bins
259/// ```
260#[derive(Debug, Clone)]
261pub struct WordSymbolizer {
262    num_symbols: usize,
263}
264
265impl WordSymbolizer {
266    /// Create a new word symbolizer with `num_symbols` frequency bins.
267    ///
268    /// # Panics
269    ///
270    /// Panics if `num_symbols < 2`.
271    #[must_use]
272    pub fn new(num_symbols: usize) -> Self {
273        assert!(num_symbols >= 2, "num_symbols must be ≥ 2");
274        Self { num_symbols }
275    }
276
277    /// Symbolize text into a sequence of frequency-binned symbols.
278    ///
279    /// # Algorithm
280    ///
281    /// 1. Tokenize on whitespace (split by `char::is_whitespace`)
282    /// 2. Count word frequencies
283    /// 3. Bin words into `num_symbols` equal-frequency tiers
284    /// 4. Map each word occurrence to its tier symbol
285    ///
286    /// # Errors
287    ///
288    /// Returns [`SymbolizeError::EmptyInput`] if text contains no words.
289    pub fn symbolize(&self, text: &str) -> Result<Vec<u8>, SymbolizeError> {
290        // Tokenize on whitespace
291        let words: Vec<&str> = text.split_whitespace().collect();
292        if words.is_empty() {
293            return Err(SymbolizeError::EmptyInput);
294        }
295
296        // Count word frequencies
297        let mut freq_map = std::collections::HashMap::new();
298        for &word in &words {
299            *freq_map.entry(word).or_insert(0usize) += 1;
300        }
301
302        // Extract frequencies and sort for quantile binning
303        let mut freqs: Vec<usize> = freq_map.values().copied().collect();
304        freqs.sort_unstable();
305
306        // Compute frequency quantile boundaries
307        let n = freqs.len();
308        let cuts: Vec<usize> = (1..self.num_symbols)
309            .map(|k| {
310                let idx = (k * n / self.num_symbols).min(n - 1);
311                freqs[idx]
312            })
313            .collect();
314
315        // Map each word to its frequency tier
316        let symbols: Vec<u8> = words
317            .iter()
318            .map(|&word| {
319                let freq = freq_map[word];
320                let sym = cuts.partition_point(|&cut| freq >= cut) as u8;
321                sym.min(self.num_symbols as u8 - 1)
322            })
323            .collect();
324
325        Ok(symbols)
326    }
327}
328
329#[cfg(test)]
330mod word_tests {
331    use super::*;
332
333    #[test]
334    fn word_symbolizer_basic() {
335        let text = "the quick brown fox jumps over the lazy dog the fox";
336        let symbolizer = WordSymbolizer::new(4);
337        let symbols = symbolizer.symbolize(text).unwrap();
338
339        // "the" appears 3×, "fox" appears 2×, others appear 1×
340        // With 4 bins, expect high-frequency words to map to higher symbols
341        assert_eq!(symbols.len(), 11);
342
343        // Verify alphabet size is reasonable
344        let alpha_size = alphabet_size(&symbols);
345        assert!(alpha_size >= 2 && alpha_size <= 4);
346    }
347
348    #[test]
349    fn word_symbolizer_repetitive_text() {
350        // Highly repetitive text should have low entropy
351        let text = "home about blog contact home about blog contact";
352        let symbolizer = WordSymbolizer::new(4);
353        let symbols = symbolizer.symbolize(text).unwrap();
354
355        // All words appear 2×, so should map to same frequency bin
356        let alpha_size = alphabet_size(&symbols);
357        assert_eq!(alpha_size, 1, "repetitive text should collapse to single symbol");
358    }
359
360    #[test]
361    fn word_symbolizer_empty_error() {
362        let symbolizer = WordSymbolizer::new(4);
363        assert_eq!(symbolizer.symbolize(""), Err(SymbolizeError::EmptyInput));
364        assert_eq!(symbolizer.symbolize("   "), Err(SymbolizeError::EmptyInput));
365    }
366
367    #[test]
368    fn word_symbolizer_single_word() {
369        let symbolizer = WordSymbolizer::new(4);
370        let symbols = symbolizer.symbolize("hello").unwrap();
371        assert_eq!(symbols.len(), 1);
372        assert_eq!(alphabet_size(&symbols), 1);
373    }
374
375    #[test]
376    fn word_symbolizer_diverse_text() {
377        // Each word appears once → all same frequency → single symbol
378        let text = "alpha beta gamma delta epsilon zeta eta theta";
379        let symbolizer = WordSymbolizer::new(4);
380        let symbols = symbolizer.symbolize(text).unwrap();
381
382        let alpha_size = alphabet_size(&symbols);
383        assert_eq!(alpha_size, 1, "uniform frequency should collapse to single symbol");
384    }
385
386    #[test]
387    #[should_panic(expected = "num_symbols must be ≥ 2")]
388    fn word_symbolizer_invalid_num_symbols() {
389        let _ = WordSymbolizer::new(1);
390    }
391}