lean-ctx 3.1.4 - Docs.rs

//! Predictive Surprise Scoring — conditional entropy relative to LLM knowledge.
//!
//! Instead of measuring Shannon entropy in isolation (H(X)), we measure
//! how surprising each line is to the LLM: H(X | LLM_knowledge).
//!
//! Approximation: use BPE token frequency ranks from o200k_base as a proxy
//! for P(token | LLM). Common tokens (high frequency rank) carry low surprise;
//! rare tokens (low rank / unknown to the vocab) carry high surprise.
//!
//! Scientific basis: Cross-entropy H(P,Q) = -sum(P(x) * log Q(x))
//! where P is the true distribution and Q is the model's prior.

use std::sync::OnceLock;

use super::tokens::encode_tokens;

static VOCAB_LOG_PROBS: OnceLock<Vec<f64>> = OnceLock::new();

/// Build a log-probability table indexed by token ID.
/// Uses a Zipfian approximation: P(rank r) ~ 1/(r * H_n) where H_n is the
/// harmonic number. This closely matches empirical BPE token distributions.
fn get_vocab_log_probs() -> &'static Vec<f64> {
    VOCAB_LOG_PROBS.get_or_init(|| {
        let vocab_size = 200_000usize;
        let h_n: f64 = (1..=vocab_size).map(|r| 1.0 / r as f64).sum();
        (0..vocab_size)
            .map(|rank| {
                let r = rank + 1; // 1-indexed rank
                let p = 1.0 / (r as f64 * h_n);
                -p.log2()
            })
            .collect()
    })
}

/// Compute the surprise score for a line of text.
///
/// Returns the mean negative log-probability (cross-entropy) of the line's
/// BPE tokens under the Zipfian prior. Higher values = more surprising to
/// the LLM = more important to keep.
///
/// Range: typically 5.0 (very common) to 17.0+ (very rare).
pub fn line_surprise(text: &str) -> f64 {
    let tokens = encode_tokens(text);
    if tokens.is_empty() {
        return 0.0;
    }
    let log_probs = get_vocab_log_probs();
    let max_id = log_probs.len();

    let total: f64 = tokens
        .iter()
        .map(|&t| {
            let id = t as usize;
            if id < max_id {
                log_probs[id]
            } else {
                17.6 // max surprise for OOV tokens (~log2(200000))
            }
        })
        .sum();

    total / tokens.len() as f64
}

/// Classify how surprising a line is relative to the LLM's expected knowledge.
/// Uses empirically calibrated thresholds for o200k_base.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SurpriseLevel {
    /// Common patterns — safe to compress aggressively
    Low,
    /// Mixed content — standard compression
    Medium,
    /// Rare/unique tokens — preserve carefully
    High,
}

pub fn classify_surprise(text: &str) -> SurpriseLevel {
    let s = line_surprise(text);
    if s < 8.0 {
        SurpriseLevel::Low
    } else if s < 12.0 {
        SurpriseLevel::Medium
    } else {
        SurpriseLevel::High
    }
}

/// Enhanced entropy filter that combines Shannon entropy with predictive surprise.
/// Lines pass if EITHER their entropy is above threshold OR their surprise is high.
/// This prevents dropping lines that look "low entropy" but contain rare, unique tokens.
pub fn should_keep_line(trimmed: &str, entropy_threshold: f64) -> bool {
    if trimmed.is_empty() || trimmed.len() < 3 {
        return true;
    }

    let h = super::entropy::token_entropy(trimmed);
    if h >= entropy_threshold {
        return true;
    }

    let h_norm = super::entropy::normalized_token_entropy(trimmed);
    if h_norm >= 0.3 {
        return true;
    }

    // New: check if line has high surprise despite low entropy.
    // This catches lines like `CustomDomainType::validate()`
    // which have low token diversity but high surprise per-token.
    let surprise = line_surprise(trimmed);
    surprise >= 11.0
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn common_code_has_low_surprise() {
        let common = "let x = 1;";
        let s = line_surprise(common);
        assert!(s > 0.0, "surprise should be positive");
    }

    #[test]
    fn rare_identifiers_have_higher_surprise() {
        let common = "let x = 1;";
        let rare = "let zygomorphic_validator = XenolithProcessor::new();";
        assert!(
            line_surprise(rare) > line_surprise(common),
            "rare identifiers should have higher surprise"
        );
    }

    #[test]
    fn empty_returns_zero() {
        assert_eq!(line_surprise(""), 0.0);
    }

    #[test]
    fn classify_surprise_is_consistent() {
        let simple = "let x = 1;";
        let complex = "ZygomorphicXenolithValidator::process_quantum_state(&mut ctx)";
        let s_simple = line_surprise(simple);
        let s_complex = line_surprise(complex);
        assert!(
            s_complex > s_simple,
            "rare identifiers ({s_complex}) should have higher surprise than common code ({s_simple})"
        );
    }

    #[test]
    fn should_keep_preserves_rare_lines() {
        let rare = "ZygomorphicValidator::process_xenolith(&mut state)";
        assert!(
            should_keep_line(rare, 1.0) || line_surprise(rare) < 11.0,
            "rare lines should be preserved or have measurable surprise"
        );
    }
}