use std::collections::HashSet;
use crate::types::ChatMessage;
#[derive(Debug)]
pub struct DeduplicationResult {
pub messages: Vec<ChatMessage>,
pub merged_count: usize,
}
#[must_use]
pub fn deduplicate_adjacent(messages: &[ChatMessage], threshold: f64) -> DeduplicationResult {
if messages.len() < 2 {
return DeduplicationResult {
messages: messages.to_vec(),
merged_count: 0,
};
}
let mut result: Vec<ChatMessage> = Vec::with_capacity(messages.len());
let mut merged_count: usize = 0;
result.push(messages[0].clone());
for msg in &messages[1..] {
let prev = &result[result.len() - 1];
if prev.role == msg.role && jaccard_similarity(&prev.content, &msg.content) > threshold {
if msg.content.len() > prev.content.len() {
let last_idx = result.len() - 1;
result[last_idx] = msg.clone();
}
merged_count += 1;
} else {
result.push(msg.clone());
}
}
DeduplicationResult {
messages: result,
merged_count,
}
}
fn jaccard_similarity(a: &str, b: &str) -> f64 {
let words_a: HashSet<&str> = a.split_whitespace().collect();
let words_b: HashSet<&str> = b.split_whitespace().collect();
if words_a.is_empty() && words_b.is_empty() {
return 1.0;
}
#[allow(clippy::cast_precision_loss)]
let intersection = words_a.intersection(&words_b).count() as f64;
#[allow(clippy::cast_precision_loss)]
let union = words_a.union(&words_b).count() as f64;
if union == 0.0 {
0.0
} else {
intersection / union
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn no_duplicates_unchanged() {
let msgs = vec![
ChatMessage::user("Hello"),
ChatMessage::assistant("Hi there!"),
ChatMessage::user("How are you?"),
];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 3);
assert_eq!(result.merged_count, 0);
}
#[test]
fn identical_adjacent_merged() {
let msgs = vec![
ChatMessage::user("What is the weather today?"),
ChatMessage::user("What is the weather today?"),
ChatMessage::assistant("It's sunny!"),
];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 2);
assert_eq!(result.merged_count, 1);
}
#[test]
fn similar_adjacent_merged() {
let msgs = vec![
ChatMessage::user("please help me with this coding task"),
ChatMessage::user("please help me with this coding task please"),
];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 1);
assert_eq!(result.merged_count, 1);
assert!(result.messages[0].content.contains("please help me"));
}
#[test]
fn different_roles_not_merged() {
let msgs = vec![
ChatMessage::user("What is the weather?"),
ChatMessage::assistant("What is the weather?"),
];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 2);
assert_eq!(result.merged_count, 0);
}
#[test]
fn dissimilar_adjacent_not_merged() {
let msgs = vec![
ChatMessage::user("What is the weather today?"),
ChatMessage::user("Tell me about quantum computing"),
];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 2);
assert_eq!(result.merged_count, 0);
}
#[test]
fn three_identical_consecutive() {
let msgs = vec![
ChatMessage::user("hello world test"),
ChatMessage::user("hello world test"),
ChatMessage::user("hello world test"),
];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 1);
assert_eq!(result.merged_count, 2);
}
#[test]
fn empty_messages() {
let msgs: Vec<ChatMessage> = Vec::new();
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 0);
assert_eq!(result.merged_count, 0);
}
#[test]
fn single_message() {
let msgs = vec![ChatMessage::user("Hello")];
let result = deduplicate_adjacent(&msgs, 0.7);
assert_eq!(result.messages.len(), 1);
assert_eq!(result.merged_count, 0);
}
#[test]
fn longer_message_kept_on_merge() {
let msgs = vec![
ChatMessage::user("short help me with code"),
ChatMessage::user("short help me with code and more things"),
];
let result = deduplicate_adjacent(&msgs, 0.4); assert_eq!(result.messages.len(), 1);
assert!(result.messages[0].content.len() > 10);
}
#[test]
fn jaccard_identical_texts() {
let sim = jaccard_similarity("hello world", "hello world");
assert!((sim - 1.0).abs() < f64::EPSILON);
}
#[test]
fn jaccard_disjoint_texts() {
let sim = jaccard_similarity("hello world", "foo bar");
assert!(sim.abs() < f64::EPSILON);
}
#[test]
fn jaccard_empty_texts() {
let sim = jaccard_similarity("", "");
assert!((sim - 1.0).abs() < f64::EPSILON);
}
}