ai_tokenopt 0.5.10

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Token-aware RAG entry ranking and fitting.
//!
//! Ranks RAG entries by a composite score (relevance, freshness,
//! token-efficiency) and greedily selects entries that fit within
//! the allocated token budget.

use crate::estimator::TokenEstimator;
use crate::prompt::rag_dedup::RagEntry;

/// A ranked RAG entry with its composite score.
#[derive(Debug, Clone)]
pub struct RankedEntry {
    /// The original RAG entry
    pub entry: RagEntry,
    /// Composite ranking score (0.0–1.0, higher = better)
    pub score: f64,
}

/// Result of ranking and fitting RAG entries.
#[derive(Debug)]
pub struct RankedRagResult {
    /// Entries selected to fit within the token budget
    pub entries: Vec<RankedEntry>,
    /// Number of entries excluded due to budget constraints
    pub excluded_count: usize,
    /// Total tokens consumed by selected entries
    pub total_tokens: u32,
}

/// Rank RAG entries by composite score and select those that fit in the budget.
///
/// # Scoring formula
///
/// Each entry is scored by: `relevance × 0.7 + token_efficiency × 0.3`
///
/// where `token_efficiency = 1.0 - (entry_tokens / budget_tokens)`.
/// This prefers entries that are both relevant AND compact.
///
/// # Arguments
///
/// * `entries` — Pre-deduplicated RAG entries
/// * `budget_tokens` — Maximum tokens available for RAG context
///
/// # Examples
///
/// ```
/// use ai_tokenopt::prompt::rag_ranker::rank_and_fit_rag;
/// use ai_tokenopt::prompt::rag_dedup::RagEntry;
///
/// let entries = vec![
///     RagEntry {
///         content: "Short relevant answer".to_string(),
///         relevance: 0.9,
///         embedding: None,
///     },
///     RagEntry {
///         content: "A very long entry that takes up a lot of space ".repeat(10),
///         relevance: 0.95,
///         embedding: None,
///     },
/// ];
/// let result = rank_and_fit_rag(&entries, 100);
/// assert!(!result.entries.is_empty());
/// ```
#[must_use]
pub fn rank_and_fit_rag(entries: &[RagEntry], budget_tokens: u32) -> RankedRagResult {
    if entries.is_empty() || budget_tokens == 0 {
        return RankedRagResult {
            entries: Vec::new(),
            excluded_count: entries.len(),
            total_tokens: 0,
        };
    }

    // Score each entry
    let mut scored: Vec<RankedEntry> = entries
        .iter()
        .map(|entry| {
            let entry_tokens = TokenEstimator::estimate_tokens(&entry.content);
            let token_efficiency = compute_token_efficiency(entry_tokens, budget_tokens);
            let score = token_efficiency.mul_add(0.3, f64::from(entry.relevance) * 0.7);

            RankedEntry {
                entry: entry.clone(),
                score,
            }
        })
        .collect();

    // Sort by score descending
    scored.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // Greedy selection within budget
    let mut selected: Vec<RankedEntry> = Vec::new();
    let mut total_tokens: u32 = 0;
    let mut excluded_count: usize = 0;

    for entry in scored {
        let entry_tokens = TokenEstimator::estimate_tokens(&entry.entry.content);
        if total_tokens + entry_tokens <= budget_tokens {
            total_tokens += entry_tokens;
            selected.push(entry);
        } else {
            excluded_count += 1;
        }
    }

    RankedRagResult {
        entries: selected,
        excluded_count,
        total_tokens,
    }
}

/// Compute a token efficiency score (0.0–1.0).
///
/// Smaller entries relative to budget get higher efficiency.
fn compute_token_efficiency(entry_tokens: u32, budget_tokens: u32) -> f64 {
    if budget_tokens == 0 {
        return 0.0;
    }
    let ratio = f64::from(entry_tokens) / f64::from(budget_tokens);
    (1.0 - ratio).max(0.0)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_entry(content: &str, relevance: f32) -> RagEntry {
        RagEntry {
            content: content.to_string(),
            relevance,
            embedding: None,
        }
    }

    #[test]
    fn empty_entries_returns_empty() {
        let result = rank_and_fit_rag(&[], 100);
        assert!(result.entries.is_empty());
        assert_eq!(result.excluded_count, 0);
    }

    #[test]
    fn zero_budget_excludes_all() {
        let entries = vec![make_entry("hello", 0.9)];
        let result = rank_and_fit_rag(&entries, 0);
        assert!(result.entries.is_empty());
        assert_eq!(result.excluded_count, 1);
    }

    #[test]
    fn single_entry_within_budget() {
        let entries = vec![make_entry("hello world", 0.9)];
        let result = rank_and_fit_rag(&entries, 100);
        assert_eq!(result.entries.len(), 1);
        assert_eq!(result.excluded_count, 0);
    }

    #[test]
    fn compact_relevant_preferred_over_long_slightly_more_relevant() {
        let entries = vec![
            make_entry("Short answer", 0.85),
            make_entry(&"Long verbose answer ".repeat(50), 0.90),
        ];
        // Small budget forces selection
        let result = rank_and_fit_rag(&entries, 20);
        assert_eq!(result.entries.len(), 1);
        assert!(result.entries[0].entry.content.contains("Short"));
    }

    #[test]
    fn high_relevance_wins_within_budget() {
        let entries = vec![
            make_entry("Less relevant entry", 0.5),
            make_entry("Very relevant entry", 0.99),
        ];
        let result = rank_and_fit_rag(&entries, 1000);
        // Both fit, but top-ranked should be the more relevant
        assert!(result.entries[0].score >= result.entries[1].score);
    }

    #[test]
    fn budget_constraint_respected() {
        let entries = vec![
            make_entry(&"word ".repeat(100), 0.9), // ~100 tokens
            make_entry(&"word ".repeat(100), 0.8), // ~100 tokens
        ];
        let result = rank_and_fit_rag(&entries, 50);
        // Neither fits in 50 tokens
        assert!(result.entries.is_empty() || result.total_tokens <= 50);
    }

    #[test]
    fn token_efficiency_correct() {
        assert!((compute_token_efficiency(10, 100) - 0.9).abs() < f64::EPSILON);
        assert!((compute_token_efficiency(50, 100) - 0.5).abs() < f64::EPSILON);
        assert!((compute_token_efficiency(100, 100)).abs() < f64::EPSILON);
    }

    #[test]
    fn total_tokens_tracked() {
        let entries = vec![
            make_entry("hello world", 0.9),
            make_entry("foo bar baz", 0.8),
        ];
        let result = rank_and_fit_rag(&entries, 1000);
        assert!(result.total_tokens > 0);
    }
}