ai_tokenopt 0.5.10

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Adjacent message deduplication.
//!
//! Detects and merges consecutive messages with highly overlapping content
//! (Jaccard similarity > 70%). Useful when users paste the same context
//! or the assistant repeats instructions across turns.

use std::collections::HashSet;

use crate::types::ChatMessage;

/// Result of deduplicating adjacent messages.
#[derive(Debug)]
pub struct DeduplicationResult {
    /// Messages after deduplication (duplicates merged)
    pub messages: Vec<ChatMessage>,
    /// Number of messages that were merged away
    pub merged_count: usize,
}

/// Deduplicate adjacent messages that share > threshold Jaccard similarity.
///
/// When adjacent messages from the **same role** exceed the similarity
/// threshold, the longer message is kept and the shorter is discarded.
/// This handles repeated pastes, echoed instructions, and other redundancies.
///
/// # Arguments
///
/// * `messages` — Conversation messages in chronological order
/// * `threshold` — Jaccard similarity threshold (0.0–1.0, recommended 0.7)
///
/// # Examples
///
/// ```
/// use ai_tokenopt::history::dedup::{deduplicate_adjacent, DeduplicationResult};
/// use ai_tokenopt::types::{ChatMessage, MessageRole};
///
/// let messages = vec![
///     ChatMessage::user("please help me with this task"),
///     ChatMessage::user("please help me with this task please"),
///     ChatMessage::assistant("Sure, I can help!"),
/// ];
/// let result = deduplicate_adjacent(&messages, 0.7);
/// assert_eq!(result.messages.len(), 2);
/// assert_eq!(result.merged_count, 1);
/// ```
#[must_use]
pub fn deduplicate_adjacent(messages: &[ChatMessage], threshold: f64) -> DeduplicationResult {
    if messages.len() < 2 {
        return DeduplicationResult {
            messages: messages.to_vec(),
            merged_count: 0,
        };
    }

    let mut result: Vec<ChatMessage> = Vec::with_capacity(messages.len());
    let mut merged_count: usize = 0;

    result.push(messages[0].clone());

    for msg in &messages[1..] {
        // Safety: result always has at least one element (pushed above)
        let prev = &result[result.len() - 1];

        if prev.role == msg.role && jaccard_similarity(&prev.content, &msg.content) > threshold {
            // Keep the longer one
            if msg.content.len() > prev.content.len() {
                let last_idx = result.len() - 1;
                result[last_idx] = msg.clone();
            }
            merged_count += 1;
        } else {
            result.push(msg.clone());
        }
    }

    DeduplicationResult {
        messages: result,
        merged_count,
    }
}

/// Compute Jaccard similarity between two texts based on word sets.
fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    if words_a.is_empty() && words_b.is_empty() {
        return 1.0;
    }

    #[allow(clippy::cast_precision_loss)]
    let intersection = words_a.intersection(&words_b).count() as f64;
    #[allow(clippy::cast_precision_loss)]
    let union = words_a.union(&words_b).count() as f64;

    if union == 0.0 {
        0.0
    } else {
        intersection / union
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn no_duplicates_unchanged() {
        let msgs = vec![
            ChatMessage::user("Hello"),
            ChatMessage::assistant("Hi there!"),
            ChatMessage::user("How are you?"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 3);
        assert_eq!(result.merged_count, 0);
    }

    #[test]
    fn identical_adjacent_merged() {
        let msgs = vec![
            ChatMessage::user("What is the weather today?"),
            ChatMessage::user("What is the weather today?"),
            ChatMessage::assistant("It's sunny!"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 2);
        assert_eq!(result.merged_count, 1);
    }

    #[test]
    fn similar_adjacent_merged() {
        let msgs = vec![
            ChatMessage::user("please help me with this coding task"),
            ChatMessage::user("please help me with this coding task please"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 1);
        assert_eq!(result.merged_count, 1);
        // The longer one is kept
        assert!(result.messages[0].content.contains("please help me"));
    }

    #[test]
    fn different_roles_not_merged() {
        let msgs = vec![
            ChatMessage::user("What is the weather?"),
            ChatMessage::assistant("What is the weather?"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 2);
        assert_eq!(result.merged_count, 0);
    }

    #[test]
    fn dissimilar_adjacent_not_merged() {
        let msgs = vec![
            ChatMessage::user("What is the weather today?"),
            ChatMessage::user("Tell me about quantum computing"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 2);
        assert_eq!(result.merged_count, 0);
    }

    #[test]
    fn three_identical_consecutive() {
        let msgs = vec![
            ChatMessage::user("hello world test"),
            ChatMessage::user("hello world test"),
            ChatMessage::user("hello world test"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 1);
        assert_eq!(result.merged_count, 2);
    }

    #[test]
    fn empty_messages() {
        let msgs: Vec<ChatMessage> = Vec::new();
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 0);
        assert_eq!(result.merged_count, 0);
    }

    #[test]
    fn single_message() {
        let msgs = vec![ChatMessage::user("Hello")];
        let result = deduplicate_adjacent(&msgs, 0.7);
        assert_eq!(result.messages.len(), 1);
        assert_eq!(result.merged_count, 0);
    }

    #[test]
    fn longer_message_kept_on_merge() {
        let msgs = vec![
            ChatMessage::user("short help me with code"),
            ChatMessage::user("short help me with code and more things"),
        ];
        let result = deduplicate_adjacent(&msgs, 0.4); // lower threshold for this test
        assert_eq!(result.messages.len(), 1);
        assert!(result.messages[0].content.len() > 10);
    }

    #[test]
    fn jaccard_identical_texts() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_texts() {
        let sim = jaccard_similarity("hello world", "foo bar");
        assert!(sim.abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_empty_texts() {
        let sim = jaccard_similarity("", "");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }
}