//! epsilon-engine 0.1.0
//!
//! CSSR ε-machine: statistical complexity S_T and entropy rate H_T from data streams.
//!
//! Pillar: III. PACR field: Γ (symbol alphabet for ε-machine construction).
//!
//! Converts continuous time-series data or text into discrete symbol sequences.
//! Three strategies are provided:
//!
//! | Strategy        | Input type | Binning method          | Use case                        |
//! |-----------------|------------|-------------------------|---------------------------------|
//! | `EqualWidth`    | f64 stream | Uniform value intervals | Known bounded range             |
//! | `EqualFrequency`| f64 stream | Uniform sample quantiles| Unknown distribution (default)  |
//! | `WordSymbolizer`| Text       | Word-level frequency    | Semantic redundancy detection   |
//!
//! # Alphabet size
//!
//! The alphabet size `|A|` trades bias for variance:
//! - Small `|A|` (2–4): under-resolves structure, misses fine-grained patterns.
//! - Large `|A|` (8–16): more states but requires larger N for statistical power.
//! - Recommended: `|A|` = 4–8 for N < 50 000 samples.
//!
//! # WordSymbolizer
//!
//! For text input, `WordSymbolizer` tokenizes on whitespace and bins words by frequency.
//! This detects semantic redundancy (e.g., "Home About Blog Contact" → low S_T)
//! that byte-level symbolizers miss.

/// Error type for symbolization failures.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum SymbolizeError {
    /// The input data slice is empty.
    EmptyInput,
    /// `num_symbols` must be ≥ 2.
    TooFewSymbols,
    /// All values are identical — cannot bin with equal-frequency.
    ConstantInput,
}

impl std::fmt::Display for SymbolizeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Each variant carries a fixed message; pick it, then emit in one call.
        let message = match self {
            Self::EmptyInput => "input data is empty",
            Self::TooFewSymbols => "num_symbols must be ≥ 2",
            Self::ConstantInput => "all values are identical; cannot bin",
        };
        f.write_str(message)
    }
}

impl std::error::Error for SymbolizeError {}

// ── Equal-Width Quantization ──────────────────────────────────────────────────

/// Quantize `data` into `num_symbols` equal-width bins.
///
/// Bin boundaries are `[min, min + w, min + 2w, ...]` where `w = (max - min) / n`.
/// Symbols are `0 ..= num_symbols - 1`.
///
/// Symbols are emitted as `u8`, so callers should keep `num_symbols <= 256`;
/// larger alphabets cannot be represented in the output type.
///
/// # Errors
///
/// Returns [`SymbolizeError`] if `data` is empty, `num_symbols < 2`,
/// or all values are identical (zero-width bins).
pub fn equal_width(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
    if data.is_empty() {
        return Err(SymbolizeError::EmptyInput);
    }
    if num_symbols < 2 {
        return Err(SymbolizeError::TooFewSymbols);
    }
    // Output symbols are u8: more than 256 bins cannot be distinguished.
    debug_assert!(
        num_symbols <= 256,
        "num_symbols > 256 overflows the u8 symbol range"
    );

    let min = data.iter().copied().fold(f64::INFINITY, f64::min);
    let max = data.iter().copied().fold(f64::NEG_INFINITY, f64::max);

    // Exact comparison, not `< f64::EPSILON`: EPSILON is the ULP at 1.0, so an
    // epsilon band would wrongly reject legitimately distinct values whose
    // whole range is tiny (e.g. [1e-18, 3e-18]). A zero-width range is exactly
    // `max == min` (abs() was redundant anyway, since max >= min here).
    if max == min {
        return Err(SymbolizeError::ConstantInput);
    }

    let width = (max - min) / num_symbols as f64;
    let last_bin = num_symbols - 1;

    let symbols = data
        .iter()
        .map(|&v| {
            // floor() then clamp so v == max lands in the last bin instead of
            // overflowing one past it. Clamping in usize (not after a u8 cast)
            // avoids wrap-around when num_symbols is near the u8 limit.
            let bin = ((v - min) / width).floor() as usize;
            bin.min(last_bin) as u8
        })
        .collect();

    Ok(symbols)
}

// ── Equal-Frequency Quantization ─────────────────────────────────────────────

/// Quantize `data` into `num_symbols` equal-frequency (quantile) bins.
///
/// Bin boundaries are chosen so each bin contains approximately the same
/// number of samples.  Symbol `k` maps to the `k`-th quantile interval.
///
/// This is the recommended strategy for CSSR: it maximises entropy in the
/// symbolic sequence, giving the suffix tree more discriminating power.
///
/// Symbols are emitted as `u8`, so callers should keep `num_symbols <= 256`.
///
/// # Errors
///
/// Returns [`SymbolizeError`] if `data` is empty, `num_symbols < 2`,
/// or all values are identical.
pub fn equal_frequency(data: &[f64], num_symbols: usize) -> Result<Vec<u8>, SymbolizeError> {
    if data.is_empty() {
        return Err(SymbolizeError::EmptyInput);
    }
    if num_symbols < 2 {
        return Err(SymbolizeError::TooFewSymbols);
    }
    // Output symbols are u8: more than 256 bins cannot be distinguished.
    debug_assert!(
        num_symbols <= 256,
        "num_symbols > 256 overflows the u8 symbol range"
    );

    // Sort a copy to find quantile boundaries (NaNs compare as Equal, so they
    // stay wherever the sort leaves them).
    let mut sorted: Vec<f64> = data.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

    // Exact equality, not an `f64::EPSILON` band: EPSILON is the ULP at 1.0,
    // so an epsilon test would wrongly reject distinct values whose whole
    // range is tiny (e.g. [1e-18, 3e-18]) — those quantile-bin just fine.
    if sorted[0] == sorted[sorted.len() - 1] {
        return Err(SymbolizeError::ConstantInput);
    }

    // Compute `num_symbols - 1` cut-points at quantile boundaries.
    // Cut k: sorted[floor(k * N / num_symbols)] for k = 1..num_symbols-1.
    let n = sorted.len();
    let cuts: Vec<f64> = (1..num_symbols)
        .map(|k| {
            let idx = (k * n / num_symbols).min(n - 1);
            sorted[idx]
        })
        .collect();

    let last_symbol = num_symbols - 1;
    let symbols = data
        .iter()
        .map(|&v| {
            // `cuts` is ascending, so `v >= cut` holds on a prefix of it; the
            // partition point is the tier index, at most cuts.len() =
            // num_symbols - 1. Clamping in usize before the u8 cast avoids the
            // `num_symbols as u8 - 1` underflow at exactly 256.
            let sym = cuts.partition_point(|&cut| v >= cut);
            sym.min(last_symbol) as u8
        })
        .collect();

    Ok(symbols)
}

// ── Utility: alphabet size inference ──────────────────────────────────────────

/// Returns the number of distinct symbols actually used in the sequence.
///
/// This may be less than the requested `num_symbols` when the data range
/// is narrower than expected.
#[must_use]
pub fn alphabet_size(symbols: &[u8]) -> usize {
    // One presence flag per possible u8 value, filled by a fold; the answer
    // is the number of flags that ended up set.
    let present = symbols.iter().fold([false; 256], |mut flags, &sym| {
        flags[sym as usize] = true;
        flags
    });
    present.iter().filter(|&&flag| flag).count()
}

// ── Tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn equal_width_basic() {
        // Four evenly spaced values into four bins: one value per bin.
        let samples = [0.0, 1.0, 2.0, 3.0];
        assert_eq!(equal_width(&samples, 4).unwrap(), vec![0, 1, 2, 3]);
    }

    #[test]
    fn equal_width_clamps_max() {
        // The exact maximum sits on the top boundary; it must be folded into
        // the last bin rather than overflowing one past it.
        let samples = [0.0, 0.5, 1.0];
        let coded = equal_width(&samples, 2).unwrap();
        // [0.0, 0.5) → 0; [0.5, 1.0] → 1 (clamped)
        assert_eq!(coded[2], 1, "max value must map to last bin");
    }

    #[test]
    fn equal_frequency_distributes_evenly() {
        // 100 uniform values across 4 quantile bins → roughly 25 each.
        let samples: Vec<f64> = (0..100).map(f64::from).collect();
        let coded = equal_frequency(&samples, 4).unwrap();
        let mut histogram = [0usize; 4];
        for &sym in &coded {
            histogram[sym as usize] += 1;
        }
        // Allow ±2 slack around the ideal 25 per bin.
        for count in histogram {
            assert!(
                (23..=27).contains(&count),
                "bin count {count} out of expected range"
            );
        }
    }

    #[test]
    fn equal_width_empty_error() {
        assert_eq!(equal_width(&[], 4), Err(SymbolizeError::EmptyInput));
    }

    #[test]
    fn equal_width_few_symbols_error() {
        let result = equal_width(&[1.0, 2.0], 1);
        assert_eq!(result, Err(SymbolizeError::TooFewSymbols));
    }

    #[test]
    fn equal_width_constant_error() {
        let result = equal_width(&[5.0, 5.0, 5.0], 4);
        assert_eq!(result, Err(SymbolizeError::ConstantInput));
    }

    #[test]
    fn equal_frequency_constant_error() {
        let result = equal_frequency(&[3.0, 3.0], 2);
        assert_eq!(result, Err(SymbolizeError::ConstantInput));
    }

    #[test]
    fn alphabet_size_counts_distinct() {
        assert_eq!(alphabet_size(&[0u8, 1, 2, 1, 0, 3]), 4);
    }

    #[test]
    fn alphabet_size_single_symbol() {
        assert_eq!(alphabet_size(&[7u8; 100]), 1);
    }
}

// ── Word-Level Symbolizer ─────────────────────────────────────────────────────

/// Symbolizer for text input using word-level frequency binning.
///
/// Tokenizes on whitespace, computes word frequencies, and bins words into
/// `num_symbols` frequency tiers. This detects semantic redundancy that
/// byte-level symbolizers miss (e.g., "Home About Blog Contact" has low S_T).
///
/// # Example
///
/// ```
/// use epsilon_engine::symbolize::WordSymbolizer;
///
/// let text = "the quick brown fox jumps over the lazy dog the fox";
/// let symbolizer = WordSymbolizer::new(4);
/// let symbols = symbolizer.symbolize(text).unwrap();
/// // "the" (3×) and "fox" (2×) map to high-frequency bins
/// // "quick", "brown", etc. (1×) map to low-frequency bins
/// ```
#[derive(Debug, Clone)]
pub struct WordSymbolizer {
    num_symbols: usize,
}

impl WordSymbolizer {
    /// Create a new word symbolizer with `num_symbols` frequency bins.
    ///
    /// # Panics
    ///
    /// Panics if `num_symbols < 2`.
    #[must_use]
    pub fn new(num_symbols: usize) -> Self {
        assert!(num_symbols >= 2, "num_symbols must be ≥ 2");
        Self { num_symbols }
    }

    /// Symbolize text into a sequence of frequency-binned symbols.
    ///
    /// # Algorithm
    ///
    /// 1. Tokenize on whitespace (split by `char::is_whitespace`)
    /// 2. Count word frequencies
    /// 3. Bin words into `num_symbols` equal-frequency tiers
    /// 4. Map each word occurrence to its tier symbol
    ///
    /// # Errors
    ///
    /// Returns [`SymbolizeError::EmptyInput`] if text contains no words.
    pub fn symbolize(&self, text: &str) -> Result<Vec<u8>, SymbolizeError> {
        // Whitespace tokenization; an input with no tokens is an error.
        let tokens: Vec<&str> = text.split_whitespace().collect();
        if tokens.is_empty() {
            return Err(SymbolizeError::EmptyInput);
        }

        // Occurrence count per distinct word.
        let mut counts = std::collections::HashMap::new();
        for &token in &tokens {
            *counts.entry(token).or_default() += 1usize;
        }

        // Sorted distinct-word frequencies, used to place quantile cut-points.
        let mut tiers: Vec<usize> = counts.values().copied().collect();
        tiers.sort_unstable();

        // Cut k sits at the frequency value sorted[floor(k * D / num_symbols)]
        // for k = 1..num_symbols-1, where D = number of distinct words.
        let distinct = tiers.len();
        let cuts: Vec<usize> = (1..self.num_symbols)
            .map(|k| tiers[(k * distinct / self.num_symbols).min(distinct - 1)])
            .collect();

        // Each occurrence maps to the tier of its word's total frequency.
        let symbols = tokens
            .iter()
            .map(|&token| {
                let freq = counts[token];
                let tier = cuts.partition_point(|&cut| freq >= cut) as u8;
                tier.min(self.num_symbols as u8 - 1)
            })
            .collect();

        Ok(symbols)
    }
}

#[cfg(test)]
mod word_tests {
    use super::*;

    #[test]
    fn word_symbolizer_basic() {
        let text = "the quick brown fox jumps over the lazy dog the fox";
        let coded = WordSymbolizer::new(4).symbolize(text).unwrap();

        // 11 tokens in, 11 symbols out; "the" (3×) and "fox" (2×) occupy
        // higher-frequency tiers than the 1× words.
        assert_eq!(coded.len(), 11);

        let distinct = alphabet_size(&coded);
        assert!(distinct >= 2 && distinct <= 4);
    }

    #[test]
    fn word_symbolizer_repetitive_text() {
        // Every word occurs exactly twice, so all land in one frequency tier.
        let text = "home about blog contact home about blog contact";
        let coded = WordSymbolizer::new(4).symbolize(text).unwrap();
        assert_eq!(
            alphabet_size(&coded),
            1,
            "repetitive text should collapse to single symbol"
        );
    }

    #[test]
    fn word_symbolizer_empty_error() {
        let symbolizer = WordSymbolizer::new(4);
        assert_eq!(symbolizer.symbolize(""), Err(SymbolizeError::EmptyInput));
        assert_eq!(symbolizer.symbolize("   "), Err(SymbolizeError::EmptyInput));
    }

    #[test]
    fn word_symbolizer_single_word() {
        let coded = WordSymbolizer::new(4).symbolize("hello").unwrap();
        assert_eq!(coded.len(), 1);
        assert_eq!(alphabet_size(&coded), 1);
    }

    #[test]
    fn word_symbolizer_diverse_text() {
        // All words occur once → uniform frequency → a single tier.
        let text = "alpha beta gamma delta epsilon zeta eta theta";
        let coded = WordSymbolizer::new(4).symbolize(text).unwrap();
        assert_eq!(
            alphabet_size(&coded),
            1,
            "uniform frequency should collapse to single symbol"
        );
    }

    #[test]
    #[should_panic(expected = "num_symbols must be ≥ 2")]
    fn word_symbolizer_invalid_num_symbols() {
        let _ = WordSymbolizer::new(1);
    }
}