ai_tokenopt 0.5.6

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Cross-turn RAG and knowledge graph context deduplication.
//!
//! Strips embedded RAG memory and knowledge graph context blocks from
//! messages older than a configurable turn decay window, keeping only
//! the most recent occurrences. This prevents the same context from
//! being repeated across many turns, saving significant tokens.

use crate::estimator::TokenEstimator;
use crate::types::{ChatMessage, MessageRole};

/// Default number of recent turns to preserve RAG/graph context in.
///
/// Used by `deduplicate_rag_across_turns` when the caller passes `None`
/// for `decay_window`.
const DEFAULT_DECAY_WINDOW: usize = 5;

/// Marker that starts an embedded RAG memory block.
///
/// Matched as an exact, case-sensitive substring. Note that it contains an
/// em dash (U+2014), not an ASCII hyphen.
// NOTE(review): presumably this must stay in sync with whatever code builds
// the system prompt — confirm against the prompt builder.
const RAG_MARKER: &str = "IMPORTANT — YOUR MEMORY:";

/// Marker that starts an embedded knowledge graph block.
///
/// Matched as an exact, case-sensitive substring.
const GRAPH_MARKER: &str = "YOUR KNOWLEDGE GRAPH:";

/// Strip stale RAG and knowledge graph context from a conversation in place.
///
/// Only system-role messages are considered. A message at slice index `i`
/// is assigned turn index `i / 2`, and its age is `current_turn - i / 2`
/// (saturating at zero). Messages whose age is at least the decay window
/// have their context blocks removed via `strip_context_blocks`; younger
/// messages and all non-system messages are left untouched.
///
/// # Arguments
///
/// * `messages` — Conversation messages to process (modified in place)
/// * `current_turn` — The current (latest) turn index
/// * `decay_window` — Number of recent turns to preserve context in;
///   `None` falls back to `DEFAULT_DECAY_WINDOW`
///
/// # Returns
///
/// The estimated number of tokens saved: the sum, over every processed
/// message, of the token estimate before stripping minus the estimate
/// after (each difference clamped at zero).
pub fn deduplicate_rag_across_turns(
    messages: &mut [ChatMessage],
    current_turn: usize,
    decay_window: Option<usize>,
) -> u32 {
    let window = decay_window.unwrap_or(DEFAULT_DECAY_WINDOW);

    // Select the system messages that have aged out of the decay window.
    let stale_system_messages =
        messages
            .iter_mut()
            .enumerate()
            .filter_map(|(position, message)| {
                // RAG/graph context is only looked for in system messages.
                if message.role != MessageRole::System {
                    return None;
                }

                // NOTE(review): `position / 2` presumably assumes two messages
                // per turn — confirm against how callers lay out the slice.
                let age = current_turn.saturating_sub(position / 2);
                (age >= window).then_some(message)
            });

    let mut tokens_saved = 0u32;
    for message in stale_system_messages {
        let tokens_before = TokenEstimator::estimate_tokens(&message.content);
        message.content = strip_context_blocks(&message.content);
        let tokens_after = TokenEstimator::estimate_tokens(&message.content);

        // Clamp at zero in case the estimate grows after stripping.
        tokens_saved += tokens_before.saturating_sub(tokens_after);
    }

    tokens_saved
}

/// Remove RAG memory and knowledge graph blocks from a message's content.
///
/// Runs `strip_block` for the RAG marker and then for the knowledge graph
/// marker, collapses any run of three or more newlines down to two, and
/// trims leading and trailing whitespace from the result.
fn strip_context_blocks(content: &str) -> String {
    let without_rag = strip_block(content, RAG_MARKER);
    let mut cleaned = strip_block(&without_rag, GRAPH_MARKER);

    // A single replace pass can leave a triple newline behind (four in a row
    // become three), so repeat until none remain.
    loop {
        if !cleaned.contains("\n\n\n") {
            break;
        }
        cleaned = cleaned.replace("\n\n\n", "\n\n");
    }

    cleaned.trim().to_owned()
}

/// Strip every context block introduced by `marker` from the text.
///
/// A block runs from an occurrence of `marker` up to and including the next
/// double newline (`"\n\n"`), or to the end of the string when no double
/// newline follows. All occurrences are removed in a single left-to-right
/// pass, so text that embeds the same kind of block more than once is fully
/// cleaned. Text outside the blocks is copied through unchanged.
///
/// An empty `marker` matches nothing: the text is returned as is.
fn strip_block(text: &str, marker: &str) -> String {
    // An empty marker would match at every position (and never make
    // progress at the end of the string); treat it as "nothing to strip".
    if marker.is_empty() {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(start) = rest.find(marker) {
        // Keep everything in front of the marker.
        result.push_str(&rest[..start]);

        // The block ends just past the next double newline after the marker,
        // or at the end of the string.
        let after_marker = start + marker.len();
        let end = rest[after_marker..]
            .find("\n\n")
            .map_or(rest.len(), |pos| after_marker + pos + 2);

        // Continue scanning after the removed block.
        rest = &rest[end..];
    }

    result.push_str(rest);
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Three-message conversation whose system prompt embeds a RAG memory block.
    fn conversation_with_old_memory() -> Vec<ChatMessage> {
        vec![
            ChatMessage::system("Old prompt.\n\nIMPORTANT — YOUR MEMORY:\n- Old memory\n\nDone."),
            ChatMessage::user("Hello"),
            ChatMessage::assistant("Hi"),
        ]
    }

    #[test]
    fn strips_rag_block_from_old_message() {
        let prompt = concat!(
            "You are an AI assistant.\n\n",
            "IMPORTANT — YOUR MEMORY:\n",
            "The items below are things you remember.\n",
            "- User likes Rust\n\n",
            "Continue helping the user.",
        );

        let stripped = strip_context_blocks(prompt);

        // The memory block is gone; the text on both sides of it survives.
        assert!(!stripped.contains("YOUR MEMORY:"));
        assert!(stripped.contains("You are an AI assistant"));
        assert!(stripped.contains("Continue helping the user"));
    }

    #[test]
    fn strips_graph_block_from_old_message() {
        let prompt = concat!(
            "You are an AI assistant.\n\n",
            "YOUR KNOWLEDGE GRAPH:\n",
            "- Berlin [City]: Capital of Germany\n\n",
            "Continue helping the user.",
        );

        let stripped = strip_context_blocks(prompt);

        assert!(!stripped.contains("KNOWLEDGE GRAPH:"));
        assert!(stripped.contains("You are an AI assistant"));
    }

    #[test]
    fn strips_both_blocks() {
        let prompt = concat!(
            "Base prompt.\n\n",
            "IMPORTANT — YOUR MEMORY:\n",
            "- Memory item 1\n\n",
            "YOUR KNOWLEDGE GRAPH:\n",
            "- Entity 1\n\n",
            "Final instructions.",
        );

        let stripped = strip_context_blocks(prompt);

        // Neither marker remains.
        assert!(!stripped.contains("YOUR MEMORY:"));
        assert!(!stripped.contains("KNOWLEDGE GRAPH:"));
        // The surrounding prompt text is kept.
        assert!(stripped.contains("Base prompt"));
        assert!(stripped.contains("Final instructions"));
    }

    #[test]
    fn preserves_content_without_markers() {
        let prompt = "Just a regular system prompt without any RAG context.";
        assert_eq!(strip_context_blocks(prompt), prompt);
    }

    #[test]
    fn cross_turn_dedup_skips_recent_messages() {
        let mut conversation = conversation_with_old_memory();

        // current_turn=1, decay_window=5 → age=1 < 5, so nothing is stripped.
        let saved = deduplicate_rag_across_turns(&mut conversation, 1, Some(5));

        assert_eq!(saved, 0);
        assert!(conversation[0].content.contains("YOUR MEMORY:"));
    }

    #[test]
    fn cross_turn_dedup_strips_old_messages() {
        let mut conversation = conversation_with_old_memory();

        // current_turn=10, decay_window=5 → age=10 ≥ 5, so the block is stripped.
        let saved = deduplicate_rag_across_turns(&mut conversation, 10, Some(5));

        assert!(saved > 0);
        assert!(!conversation[0].content.contains("YOUR MEMORY:"));
    }

    #[test]
    fn does_not_modify_user_messages() {
        // A marker inside a user message must be left alone, however old it is.
        let user_text = "IMPORTANT — YOUR MEMORY:\n- Fake marker in user msg";
        let mut conversation = vec![ChatMessage::user(user_text)];

        let saved = deduplicate_rag_across_turns(&mut conversation, 10, Some(1));

        assert_eq!(saved, 0);
        assert!(conversation[0].content.contains("YOUR MEMORY:"));
    }

    #[test]
    fn strip_block_at_end_of_string() {
        // No double newline follows the marker, so the block runs to the end.
        let prompt = "Prefix text.\n\nIMPORTANT — YOUR MEMORY:\n- Memory item at the end";

        let stripped = strip_context_blocks(prompt);

        assert!(!stripped.contains("YOUR MEMORY:"));
        assert!(stripped.contains("Prefix text"));
    }
}