ai_tokenopt 0.5.7

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Tiered summary re-summarization.
//!
//! When the rolling summary grows too large, this module compresses it
//! through two tiers:
//!
//! 1. **Algorithmic** (>70% of budget): Drop near-duplicate `|`-delimited
//!    topic segments, keeping the longer segment of each similar pair.
//! 2. **LLM** (still >90% of budget after tier 1): Use the `SummarizationPort`
//!    to produce a tighter summary targeting roughly half the token budget.

use crate::estimator::TokenEstimator;
use crate::ports::SummarizationPort;

/// Pressure threshold for algorithmic deduplication (70%).
///
/// "Pressure" is the estimated token count of the summary divided by the
/// caller's `max_tokens`. At or below this value the summary is returned
/// untouched.
const ALGORITHMIC_THRESHOLD: f64 = 0.7;

/// Pressure threshold for LLM re-summarization (90%).
///
/// Compared against the pressure measured *after* algorithmic deduplication;
/// at or below this value the deduplicated summary is returned as-is.
const LLM_THRESHOLD: f64 = 0.9;

/// Compact a rolling summary to relieve pressure on a `max_tokens` budget.
///
/// "Pressure" is the estimated token count of the summary divided by
/// `max_tokens`. Compression escalates through two tiers:
///
/// - **Algorithmic** (pressure > 0.7): drops near-duplicate `|`-delimited
///   topic segments via `deduplicate_segments`.
/// - **LLM** (pressure still > 0.9 after tier 1, `summarizer` provided): asks
///   the summarizer to compress the deduplicated summary to approximately
///   `max_tokens / 2` tokens.
///
/// # Arguments
///
/// - `summary`: the current rolling summary text.
/// - `max_tokens`: the token budget; `0` disables compaction.
/// - `summarizer`: optional LLM port used for tier 2.
///
/// # Returns
///
/// The (possibly) compacted summary. This is best-effort: the result may still
/// exceed the budget when no summarizer is supplied, when the summarizer call
/// fails, or when its reply is blank or not shorter than the deduplicated
/// summary. In all of those cases the deduplicated summary is returned.
pub async fn compact_summary(
    summary: &str,
    max_tokens: u32,
    summarizer: Option<&dyn SummarizationPort>,
) -> String {
    if summary.is_empty() || max_tokens == 0 {
        return summary.to_string();
    }

    // `max_tokens` is at least 1 past the guard above, so the divisions below
    // need no further clamping.
    let budget = f64::from(max_tokens);

    let current_tokens = TokenEstimator::estimate_tokens(summary);
    if f64::from(current_tokens) / budget <= ALGORITHMIC_THRESHOLD {
        return summary.to_string();
    }

    // Tier 1: Algorithmic deduplication of topic segments
    let deduped = deduplicate_segments(summary);
    let deduped_tokens = TokenEstimator::estimate_tokens(&deduped);
    if f64::from(deduped_tokens) / budget <= LLM_THRESHOLD {
        return deduped;
    }

    // Tier 2: LLM re-summarization
    if let Some(port) = summarizer {
        let target = max_tokens / 2;
        let prompt = format!(
            "Compress this conversation summary to approximately {target} tokens. \
             Preserve key facts, decisions, and context. Remove redundancies.\n\n{deduped}"
        );

        // A failed call is deliberately ignored: compaction is best-effort and
        // the deduplicated summary below is always a valid fallback.
        if let Ok(result) = port
            .summarize(
                "You are a precise summarizer. Output only the compressed summary.",
                &prompt,
            )
            .await
        {
            // Accept the LLM result only if it carries content and is actually
            // shorter. A blank reply would otherwise count as "shorter" and
            // silently erase the whole rolling summary.
            if !result.trim().is_empty()
                && TokenEstimator::estimate_tokens(&result) < deduped_tokens
            {
                return result;
            }
        }
    }

    deduped
}

/// Deduplicate `|`-delimited topic segments by Jaccard word-set similarity.
///
/// Each segment is reduced to the set of its whitespace-separated words, with
/// leading/trailing ASCII punctuation stripped and words shorter than two
/// bytes discarded. Two segments whose word sets have a Jaccard similarity
/// above 0.85 are treated as duplicates and only the longer one (by byte
/// length; the earlier one on a tie) is kept.
///
/// Segments that yield *no* words at all (e.g. `x = 1`) carry no signal for
/// the similarity measure, so they are only considered duplicates of each
/// other when their trimmed text is identical.
///
/// Returns the surviving segments, trimmed and joined with `" | "`, in their
/// original order. Input with at most one non-empty segment is returned
/// unchanged.
fn deduplicate_segments(text: &str) -> String {
    use std::collections::HashSet;

    // Jaccard score above which two segments count as duplicates.
    const SIMILARITY_THRESHOLD: f64 = 0.85;

    let segments: Vec<&str> = text
        .split('|')
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .collect();

    if segments.len() <= 1 {
        return text.to_string();
    }

    let word_sets: Vec<HashSet<&str>> = segments
        .iter()
        .map(|segment| {
            segment
                .split_whitespace()
                .map(|w| w.trim_matches(|c: char| c.is_ascii_punctuation()))
                .filter(|w| w.len() >= 2)
                .collect()
        })
        .collect();

    let mut kept = vec![true; segments.len()];

    for i in 0..segments.len() {
        if !kept[i] {
            continue;
        }
        for j in (i + 1)..segments.len() {
            if !kept[j] {
                continue;
            }

            // Two empty word sets would score a Jaccard similarity of 1.0 even
            // though the segments may be unrelated; fall back to exact text
            // comparison so distinct short-token segments are not lost.
            let is_duplicate = if word_sets[i].is_empty() && word_sets[j].is_empty() {
                segments[i] == segments[j]
            } else {
                jaccard(&word_sets[i], &word_sets[j]) > SIMILARITY_THRESHOLD
            };
            if !is_duplicate {
                continue;
            }

            // Keep the longer segment, drop the shorter one
            if segments[i].len() >= segments[j].len() {
                kept[j] = false;
            } else {
                kept[i] = false;
                break; // i is dropped, no need to compare further
            }
        }
    }

    segments
        .iter()
        .zip(&kept)
        .filter(|&(_, &keep)| keep)
        .map(|(&segment, _)| segment)
        .collect::<Vec<_>>()
        .join(" | ")
}

/// Jaccard similarity between two word sets (intersection / union).
///
/// Returns a value in `[0.0, 1.0]`. Two empty sets are defined to be
/// identical and score `1.0`.
// Word counts are tiny compared to f64's 52-bit mantissa, so the usize -> f64
// casts below cannot lose precision in practice.
#[allow(clippy::cast_precision_loss)]
fn jaccard(a: &std::collections::HashSet<&str>, b: &std::collections::HashSet<&str>) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    let intersection = a.intersection(b).count();
    // |A ∪ B| = |A| + |B| − |A ∩ B|. At least one set is non-empty here, so
    // the union is never zero and the division is well-defined.
    let union = a.len() + b.len() - intersection;
    intersection as f64 / union as f64
}

// Unit tests for summary compaction: the segment deduplicator, the Jaccard
// helper, and the tier selection in `compact_summary`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::TokenOptError;

    /// Stand-in summarizer that ignores its inputs and always succeeds with a
    /// fixed short reply, so tests can detect whether the LLM tier was used.
    struct MockSummarizer;

    #[async_trait::async_trait]
    impl SummarizationPort for MockSummarizer {
        async fn summarize(
            &self,
            _system_prompt: &str,
            _text: &str,
        ) -> Result<String, TokenOptError> {
            Ok("Compressed summary.".to_string())
        }
    }

    // Two long segments differing in a single word exceed the similarity
    // threshold, so one of them must be dropped.
    #[test]
    fn dedup_removes_overlapping_segments() {
        // Segments with >85% word-set overlap should be merged.
        // Jaccard > 0.85 requires (n-1)/(n+1) > 0.85 ⟹ n >= 13.
        let text = "User asked about the current weather forecast conditions \
                    for Berlin Germany today with temperature | \
                    User asked about the current weather forecast conditions \
                    for Berlin Germany tonight with temperature | \
                    Rust programming language discussion";
        let result = deduplicate_segments(text);
        assert!(
            result.matches('|').count() < text.matches('|').count(),
            "Should have fewer segments after dedup: {result}"
        );
    }

    // Unrelated segments share no words and must all survive (three segments
    // means two `|` separators in the output).
    #[test]
    fn dedup_preserves_distinct_segments() {
        let text = "Weather in Berlin is sunny | Rust programming discussion | \
                    Calendar appointment for tomorrow";
        let result = deduplicate_segments(text);
        assert_eq!(
            result.matches('|').count(),
            2,
            "All distinct segments should be kept: {result}"
        );
    }

    // Input without any `|` separator is returned unchanged.
    #[test]
    fn dedup_handles_single_segment() {
        let text = "Just one topic here";
        let result = deduplicate_segments(text);
        assert_eq!(result, text);
    }

    // Empty input yields empty output.
    #[test]
    fn dedup_handles_empty_string() {
        let result = deduplicate_segments("");
        assert_eq!(result, "");
    }

    // A set compared with itself scores exactly 1.0.
    #[test]
    fn jaccard_identical_sets() {
        let a: std::collections::HashSet<&str> = ["hello", "world"].into_iter().collect();
        let sim = jaccard(&a, &a);
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    // Sets with no common element score 0.0.
    #[test]
    fn jaccard_disjoint_sets() {
        let a: std::collections::HashSet<&str> = ["hello", "world"].into_iter().collect();
        let b: std::collections::HashSet<&str> = ["foo", "bar"].into_iter().collect();
        let sim = jaccard(&a, &b);
        assert!(sim.abs() < f64::EPSILON);
    }

    // Two empty sets are defined as identical (1.0) rather than dividing by zero.
    #[test]
    fn jaccard_empty_sets() {
        let a: std::collections::HashSet<&str> = std::collections::HashSet::new();
        let sim = jaccard(&a, &a);
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    // With a generous budget the summary must come back untouched.
    // NOTE(review): relies on the estimator counting "Short summary" at no
    // more than 700 tokens.
    #[tokio::test]
    async fn compact_below_threshold_unchanged() {
        let summary = "Short summary";
        // max_tokens large enough that pressure is < 0.7
        let result = compact_summary(summary, 1000, None).await;
        assert_eq!(result, summary);
    }

    // Without a summarizer only the algorithmic tier can run; the output must
    // never grow. This only checks length, not which segments were removed.
    #[tokio::test]
    async fn compact_high_pressure_triggers_dedup() {
        // Create a long summary with repeated segments that will cause high pressure
        let summary = "Weather in Berlin is sunny today | \
                       Weather in Berlin is sunny and warm | \
                       User discussed Rust programming";
        // Set max_tokens low to trigger high pressure
        let result = compact_summary(summary, 10, None).await;
        assert!(
            result.len() <= summary.len(),
            "Compacted summary should not be longer"
        );
    }

    // With a tiny budget and a summarizer present, the mock's fixed reply must
    // be returned.
    // NOTE(review): assumes the estimator rates the mock reply at fewer tokens
    // than the deduplicated summary; otherwise the LLM result is discarded.
    #[tokio::test]
    async fn compact_very_high_pressure_uses_llm() {
        let summary = "A very long summary with lots of content | \
                       More content about different things | \
                       Even more unique content here";
        let summarizer = MockSummarizer;
        let result = compact_summary(summary, 5, Some(&summarizer)).await;
        assert_eq!(result, "Compressed summary.");
    }

    // An empty summary short-circuits before any estimation or compression.
    #[tokio::test]
    async fn compact_empty_summary_unchanged() {
        let result = compact_summary("", 100, None).await;
        assert_eq!(result, "");
    }
}