ai_tokenopt 0.5.9

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Cross-turn RAG and knowledge graph context deduplication.
//!
//! Strips embedded RAG memory and knowledge graph context blocks from
//! messages older than a configurable turn decay window, keeping only
//! the most recent occurrences. This prevents the same context from
//! being repeated across many turns, saving significant tokens.

use crate::estimator::TokenEstimator;
use crate::types::{ChatMessage, MessageRole};

/// Default number of recent turns to preserve RAG/graph context in.
///
/// Used by `deduplicate_rag_across_turns` when the caller passes `None` for
/// `decay_window`. A system message keeps its context blocks while its turn
/// age is strictly below this value.
const DEFAULT_DECAY_WINDOW: usize = 5;

/// Marker that starts an embedded RAG memory block.
///
/// Matched verbatim as a substring (including the em dash), so any change to
/// the header text must be mirrored here. `strip_block` also treats this
/// marker as a boundary that ends a knowledge graph block.
const RAG_MARKER: &str = "IMPORTANT — YOUR MEMORY:";

/// Marker that starts an embedded knowledge graph block.
///
/// Matched verbatim as a substring. `strip_block` also treats this marker as
/// a boundary that ends a RAG memory block.
const GRAPH_MARKER: &str = "YOUR KNOWLEDGE GRAPH:";

/// Remove embedded RAG and knowledge graph context from old messages.
///
/// For messages older than `decay_window` turns (measured from the end),
/// strips any RAG memory or knowledge graph context blocks. This prevents
/// the same context from being repeated across many turns.
///
/// # Arguments
///
/// * `messages` — Conversation messages to process (modified in place)
/// * `current_turn` — The current (latest) turn index
/// * `decay_window` — Number of recent turns to preserve context in
///
/// # Returns
///
/// The estimated tokens saved by stripping old context blocks.
pub fn deduplicate_rag_across_turns(
    messages: &mut [ChatMessage],
    current_turn: usize,
    decay_window: Option<usize>,
) -> u32 {
    let window = decay_window.unwrap_or(DEFAULT_DECAY_WINDOW);
    let mut tokens_saved = 0u32;

    for (i, msg) in messages.iter_mut().enumerate() {
        // Only process system messages (where RAG/graph context is injected)
        if msg.role != MessageRole::System {
            continue;
        }

        // Calculate turn age: messages at the end are the newest
        let turn_index = i / 2;
        let age = current_turn.saturating_sub(turn_index);
        if age < window {
            continue;
        }

        let before_tokens = TokenEstimator::estimate_tokens(&msg.content);
        msg.content = strip_context_blocks(&msg.content);
        let after_tokens = TokenEstimator::estimate_tokens(&msg.content);
        tokens_saved += before_tokens.saturating_sub(after_tokens);
    }

    tokens_saved
}

/// Strip RAG memory and knowledge graph blocks from a message's content.
///
/// Delegates the actual block removal to `strip_block`, once for the RAG
/// marker and then once for the knowledge graph marker. Afterwards any run of
/// three or more consecutive newlines is reduced to exactly two, and leading
/// and trailing whitespace is trimmed from the result.
fn strip_context_blocks(content: &str) -> String {
    // Apply the markers in a fixed order: RAG first, then knowledge graph.
    let stripped = [RAG_MARKER, GRAPH_MARKER]
        .iter()
        .fold(content.to_string(), |acc, marker| strip_block(&acc, marker));

    // Collapse blank-line runs left behind by the removal: never emit more
    // than two newlines in a row.
    let mut tidy = String::with_capacity(stripped.len());
    let mut newline_run = 0usize;
    for ch in stripped.chars() {
        if ch == '\n' {
            newline_run += 1;
            if newline_run > 2 {
                continue;
            }
        } else {
            newline_run = 0;
        }
        tidy.push(ch);
    }

    tidy.trim().to_string()
}

/// Strip every context block introduced by `marker` from the text.
///
/// For each occurrence of `marker`, removes the entire block — header,
/// preamble, AND all list entries — by scanning forward to the next known
/// section boundary or the end of the string. Known boundaries are the other
/// context marker and markdown headings of level two or deeper (`## `,
/// `### `, …). Text before the marker and from the boundary onwards is kept
/// verbatim.
///
/// Scanning resumes after each removed block, so a marker that reappears
/// later in the text (for example after an intervening block of the other
/// kind) is stripped as well. An empty `marker` matches nothing and returns
/// the text unchanged.
fn strip_block(text: &str, marker: &str) -> String {
    // Prefix shared by every markdown heading of level two or deeper.
    const HEADING: &str = "## ";

    // An empty marker would match at every offset and never make progress.
    if marker.is_empty() {
        return text.to_string();
    }

    // Known section boundaries that signal the end of the current block.
    let boundaries: [&str; 3] = [RAG_MARKER, GRAPH_MARKER, HEADING];

    let mut kept = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(start) = rest.find(marker) {
        kept.push_str(&rest[..start]);
        let body = &rest[start + marker.len()..];

        let end = boundaries
            .iter()
            .copied()
            // The block's own marker never terminates it.
            .filter(|&boundary| boundary != marker)
            .filter_map(|boundary| {
                let mut pos = body.find(boundary)?;
                if boundary == HEADING {
                    // "## " also matches inside "### "; back up over the
                    // leading '#' run so the following heading keeps its
                    // full level instead of losing characters to the block.
                    let bytes = body.as_bytes();
                    while pos > 0 && bytes[pos - 1] == b'#' {
                        pos -= 1;
                    }
                }
                Some(pos)
            })
            .min()
            .unwrap_or(body.len());

        // Continue scanning after the removed block; every iteration consumes
        // at least the marker itself, so the loop always terminates.
        rest = &body[end..];
    }

    kept.push_str(rest);
    kept
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Fails the test if any of `needles` still occurs in `haystack`.
    fn assert_absent(haystack: &str, needles: &[&str]) {
        for needle in needles {
            assert!(
                !haystack.contains(needle),
                "did not expect {needle:?} in {haystack:?}"
            );
        }
    }

    /// Fails the test unless every one of `needles` occurs in `haystack`.
    fn assert_present(haystack: &str, needles: &[&str]) {
        for needle in needles {
            assert!(
                haystack.contains(needle),
                "expected {needle:?} in {haystack:?}"
            );
        }
    }

    /// Realistic layout: RAG preamble plus numbered entries, directly
    /// followed by a knowledge graph block. Both must disappear entirely.
    #[test]
    fn strips_rag_block_with_entries_from_old_message() {
        let prompt = "You are an AI assistant.\n\n\
                      IMPORTANT — YOUR MEMORY:\n\
                      The items below are things you remember.\n\n\
                      1. [Fact] (relevance: 95%): User likes Rust\n\
                      2. [Preference] (relevance: 85%): Prefers dark mode\n\n\
                      YOUR KNOWLEDGE GRAPH:\n\
                      The entities below are facts.\n\n\
                      - Berlin [City] (relevance: 90%): Capital of Germany";

        let stripped = strip_context_blocks(prompt);

        assert_absent(
            &stripped,
            &["YOUR MEMORY:", "KNOWLEDGE GRAPH:", "User likes Rust", "Berlin"],
        );
        assert_present(&stripped, &["You are an AI assistant"]);
    }

    /// A RAG block that runs to the end of the prompt, with no section after it.
    #[test]
    fn strips_rag_block_without_following_section() {
        let prompt = "You are an AI assistant.\n\n\
                      IMPORTANT — YOUR MEMORY:\n\
                      The items below are things you remember.\n\n\
                      1. [Fact] (relevance: 95%): User likes Rust";

        let stripped = strip_context_blocks(prompt);

        assert_absent(&stripped, &["YOUR MEMORY:", "User likes Rust"]);
        assert_present(&stripped, &["You are an AI assistant"]);
    }

    /// A knowledge graph block followed by a markdown section that must survive.
    #[test]
    fn strips_graph_block_from_old_message() {
        let prompt = "You are an AI assistant.\n\n\
                      YOUR KNOWLEDGE GRAPH:\n\
                      The entities below are facts.\n\n\
                      - Berlin [City]: Capital of Germany\n\n\
                      ## Response style\n\
                      Keep responses concise.";

        let stripped = strip_context_blocks(prompt);

        assert_absent(&stripped, &["KNOWLEDGE GRAPH:", "Berlin"]);
        assert_present(&stripped, &["You are an AI assistant", "## Response style"]);
    }

    /// Both block kinds in one prompt, sandwiched between base text and a
    /// trailing markdown section.
    #[test]
    fn strips_both_blocks() {
        let prompt = "Base prompt.\n\n\
                      IMPORTANT — YOUR MEMORY:\n\
                      Preamble.\n\n\
                      1. [Fact]: Memory item 1\n\n\
                      YOUR KNOWLEDGE GRAPH:\n\
                      Preamble.\n\n\
                      - Entity 1\n\n\
                      ## Response style\n\
                      Final instructions.";

        let stripped = strip_context_blocks(prompt);

        assert_absent(
            &stripped,
            &["YOUR MEMORY:", "Memory item 1", "KNOWLEDGE GRAPH:", "Entity 1"],
        );
        assert_present(&stripped, &["Base prompt", "## Response style"]);
    }

    /// A prompt without any marker comes back unchanged.
    #[test]
    fn preserves_content_without_markers() {
        let prompt = "Just a regular system prompt without any RAG context.";
        let stripped = strip_context_blocks(prompt);
        assert_eq!(stripped, prompt);
    }

    /// A system message inside the decay window keeps its context.
    #[test]
    fn cross_turn_dedup_skips_recent_messages() {
        let mut history = vec![
            ChatMessage::system(
                "Old prompt.\n\n\
                 IMPORTANT — YOUR MEMORY:\n\
                 Preamble.\n\n\
                 1. [Fact]: Old memory",
            ),
            ChatMessage::user("Hello"),
            ChatMessage::assistant("Hi"),
        ];

        // current_turn=1, decay_window=5 → age=1 < 5, so nothing is stripped.
        let saved = deduplicate_rag_across_turns(&mut history, 1, Some(5));

        assert_eq!(saved, 0);
        assert_present(&history[0].content, &["YOUR MEMORY:"]);
    }

    /// A system message older than the decay window loses its context block.
    #[test]
    fn cross_turn_dedup_strips_old_messages() {
        let mut history = vec![
            ChatMessage::system(
                "Old prompt.\n\n\
                 IMPORTANT — YOUR MEMORY:\n\
                 Preamble.\n\n\
                 1. [Fact]: Old memory\n\
                 2. [Preference]: Dark mode",
            ),
            ChatMessage::user("Hello"),
            ChatMessage::assistant("Hi"),
        ];

        // current_turn=10, decay_window=5 → age=10 > 5, so the block is stripped.
        let saved = deduplicate_rag_across_turns(&mut history, 10, Some(5));

        assert!(saved > 0);
        assert_absent(
            &history[0].content,
            &["YOUR MEMORY:", "Old memory", "Dark mode"],
        );
        assert_present(&history[0].content, &["Old prompt"]);
    }

    /// Marker text inside a user message is never treated as injected context.
    #[test]
    fn does_not_modify_user_messages() {
        let mut history = vec![ChatMessage::user(
            "IMPORTANT — YOUR MEMORY:\n- Fake marker in user msg",
        )];

        let saved = deduplicate_rag_across_turns(&mut history, 10, Some(1));

        assert_eq!(saved, 0);
        assert_present(&history[0].content, &["YOUR MEMORY:"]);
    }

    /// A block with no terminating boundary is removed through end of string.
    #[test]
    fn strip_block_at_end_of_string() {
        let prompt = "Prefix text.\n\nIMPORTANT — YOUR MEMORY:\n- Memory item at the end";

        let stripped = strip_context_blocks(prompt);

        assert_absent(&stripped, &["YOUR MEMORY:"]);
        assert_present(&stripped, &["Prefix text"]);
    }
}