use crate::error::StorageError;
use crate::storage::DbPool;
use std::collections::HashSet;
/// Detects duplicate or near-duplicate replies by consulting the reply
/// history stored behind the shared database pool.
pub struct DedupChecker {
    /// Connection pool used for all reply-history lookups.
    pool: DbPool,
}
impl DedupChecker {
    /// Creates a checker backed by the given database pool.
    pub fn new(pool: DbPool) -> Self {
        Self { pool }
    }

    /// Returns `true` if a reply to `tweet_id` has already been recorded.
    ///
    /// # Errors
    /// Propagates any [`StorageError`] from the underlying lookup.
    pub async fn has_replied_to(&self, tweet_id: &str) -> Result<bool, StorageError> {
        crate::storage::replies::has_replied_to(&self.pool, tweet_id).await
    }

    /// Returns `true` if `new_reply` is an exact duplicate of — or
    /// near-identical (Jaccard token similarity >= 0.8) to — any of the
    /// `limit` most recent stored replies.
    ///
    /// Replies with fewer than 5 tokens are only compared for exact
    /// equality, since Jaccard similarity is unreliable on tiny token sets.
    /// An empty `new_reply` is never considered similar.
    ///
    /// # Errors
    /// Propagates any [`StorageError`] from fetching the recent replies.
    pub async fn is_phrasing_similar(
        &self,
        new_reply: &str,
        limit: i64,
    ) -> Result<bool, StorageError> {
        if new_reply.is_empty() {
            return Ok(false);
        }
        let recent = crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await?;
        let new_tokens = tokenize(new_reply);
        // Hoisted loop-invariant: short replies skip the fuzzy comparison
        // entirely, so decide that once instead of on every iteration.
        let use_similarity = new_tokens.len() >= 5;
        for recent_reply in &recent {
            if new_reply == recent_reply {
                return Ok(true);
            }
            if !use_similarity {
                continue;
            }
            let recent_tokens = tokenize(recent_reply);
            if jaccard_similarity(&new_tokens, &recent_tokens) >= 0.8 {
                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns the contents of the `limit` most recent replies, newest first
    /// per the underlying storage query.
    ///
    /// # Errors
    /// Propagates any [`StorageError`] from the underlying lookup.
    pub async fn get_recent_reply_phrases(&self, limit: i64) -> Result<Vec<String>, StorageError> {
        crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await
    }
}
/// Splits `text` on whitespace into a set of lowercase tokens, trimming any
/// leading/trailing non-alphanumeric characters from each word. Words that
/// are pure punctuation are dropped; duplicates collapse via the set.
fn tokenize(text: &str) -> HashSet<String> {
    let lowered = text.to_lowercase();
    let mut tokens = HashSet::new();
    for word in lowered.split_whitespace() {
        // Strip punctuation from the edges only; interior characters
        // (apostrophes, hyphens) are preserved.
        let trimmed = word.trim_matches(|c: char| !c.is_alphanumeric());
        if !trimmed.is_empty() {
            tokens.insert(trimmed.to_string());
        }
    }
    tokens
}
/// Computes the Jaccard similarity |a ∩ b| / |a ∪ b| of two token sets.
/// Two empty sets are defined as identical (similarity 1.0), which also
/// avoids a 0/0 division.
fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let union = a.union(b).count();
    // The union is empty exactly when both inputs are empty.
    if union == 0 {
        return 1.0;
    }
    let intersection = a.intersection(b).count();
    intersection as f64 / union as f64
}
#[cfg(test)]
mod tests {
    //! Unit tests for tokenization, Jaccard similarity, and the
    //! database-backed dedup checks. DB-backed tests run against a fresh
    //! in-memory/test database from `init_test_db`.
    use super::*;
    use crate::storage::init_test_db;
    use crate::storage::replies::{insert_reply, ReplySent};

    /// Builds a minimal "sent" reply row for insertion in tests.
    fn sample_reply(target_id: &str, content: &str) -> ReplySent {
        ReplySent {
            id: 0,
            target_tweet_id: target_id.to_string(),
            reply_tweet_id: Some("r_123".to_string()),
            reply_content: content.to_string(),
            llm_provider: None,
            llm_model: None,
            created_at: chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string(),
            status: "sent".to_string(),
            error_message: None,
        }
    }

    // --- tokenize ---

    #[test]
    fn tokenize_basic() {
        let tokens = tokenize("Hello, World! This is a test.");
        assert!(tokens.contains("hello"));
        assert!(tokens.contains("world"));
        assert!(tokens.contains("test"));
        assert!(!tokens.contains(""));
    }

    #[test]
    fn tokenize_strips_punctuation() {
        // Brackets/braces/parens are non-alphanumeric, so they are trimmed.
        let tokens = tokenize("(great) [tool] {for} developers!");
        assert!(tokens.contains("great"));
        assert!(tokens.contains("tool"));
        assert!(tokens.contains("for"));
        assert!(tokens.contains("developers"));
    }

    #[test]
    fn tokenize_empty_string() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    // --- jaccard_similarity ---

    #[test]
    fn jaccard_identical_sets() {
        let a: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let b = a.clone();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_sets() {
        let a: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let b: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b)).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_partial_overlap() {
        // 2 shared of 4 total distinct tokens -> 0.5.
        let a: HashSet<String> = ["hello", "world", "foo"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let b: HashSet<String> = ["hello", "world", "bar"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let sim = jaccard_similarity(&a, &b);
        assert!((sim - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_empty_sets() {
        // Two empty sets are defined as fully similar (avoids 0/0).
        let a: HashSet<String> = HashSet::new();
        let b: HashSet<String> = HashSet::new();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    // --- DedupChecker (DB-backed) ---

    #[tokio::test]
    async fn has_replied_to_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        assert!(!checker.has_replied_to("tweet_123").await.expect("check"));
        let reply = sample_reply("tweet_123", "Some reply");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker.has_replied_to("tweet_123").await.expect("check"));
        assert!(!checker.has_replied_to("tweet_456").await.expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_exact_match() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "This is a great tool for developers");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker
            .is_phrasing_similar("This is a great tool for developers", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_high_overlap() {
        // 8 of 9 tokens shared -> Jaccard 8/10 = 0.8, at the threshold.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "This is a great tool for developers and engineers");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker
            .is_phrasing_similar("This is a great tool for developers and designers", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_no_overlap() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "This is a great tool for developers and engineers");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(!checker
            .is_phrasing_similar("I love cooking pasta with fresh basil and tomatoes", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_empty_string() {
        // Empty input short-circuits to "not similar" before any DB work.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "Some reply");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(!checker.is_phrasing_similar("", 20).await.expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_short_reply_skips_similarity() {
        // Under 5 tokens: exact match still detected, fuzzy match skipped.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply("t1", "Great point!");
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(checker
            .is_phrasing_similar("Great point!", 20)
            .await
            .expect("check"));
        assert!(!checker
            .is_phrasing_similar("Good point!", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_no_recent_replies() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        assert!(!checker
            .is_phrasing_similar("Any reply text here that is long enough to test", 20)
            .await
            .expect("check"));
    }

    #[tokio::test]
    async fn get_recent_reply_phrases_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let r1 = sample_reply("t1", "Reply one");
        let r2 = sample_reply("t2", "Reply two");
        insert_reply(&pool, &r1).await.expect("ins1");
        insert_reply(&pool, &r2).await.expect("ins2");
        let phrases = checker.get_recent_reply_phrases(5).await.expect("get");
        assert_eq!(phrases.len(), 2);
    }

    // --- additional tokenize edge cases ---

    #[test]
    fn tokenize_mixed_punctuation() {
        let tokens = tokenize("Hello! World? This... is (a) test.");
        assert!(tokens.contains("hello"));
        assert!(tokens.contains("world"));
        assert!(tokens.contains("this"));
        assert!(tokens.contains("a"));
        assert!(tokens.contains("test"));
    }

    #[test]
    fn tokenize_duplicate_words() {
        // Set semantics: repeated words collapse to one entry.
        let tokens = tokenize("hello hello hello world");
        assert_eq!(tokens.len(), 2);
    }

    #[test]
    fn tokenize_only_punctuation() {
        let tokens = tokenize("!!! ... ???");
        assert!(tokens.is_empty());
    }

    #[test]
    fn tokenize_single_word() {
        let tokens = tokenize("hello");
        assert_eq!(tokens.len(), 1);
        assert!(tokens.contains("hello"));
    }

    #[test]
    fn tokenize_case_normalization() {
        // Lowercasing happens before collection, so case variants merge.
        let tokens = tokenize("Hello HELLO hello");
        assert_eq!(tokens.len(), 1);
        assert!(tokens.contains("hello"));
    }

    // --- additional jaccard edge cases ---

    #[test]
    fn jaccard_one_empty_one_not() {
        let a: HashSet<String> = HashSet::new();
        let b: HashSet<String> = ["hello"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b) - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_single_element_match() {
        let a: HashSet<String> = ["hello"].iter().map(|s| s.to_string()).collect();
        let b: HashSet<String> = ["hello"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_superset() {
        // 2 shared of 3 distinct -> 2/3.
        let a: HashSet<String> = ["hello", "world", "foo"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let b: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let sim = jaccard_similarity(&a, &b);
        assert!((sim - 2.0 / 3.0).abs() < 0.001);
    }

    #[test]
    fn jaccard_high_overlap_threshold() {
        // 8 shared of 10 distinct -> exactly 0.8, the similarity cutoff.
        let common: Vec<String> = (0..8).map(|i| format!("word{i}")).collect();
        let mut a: HashSet<String> = common.iter().cloned().collect();
        a.insert("unique_a".to_string());
        let mut b: HashSet<String> = common.iter().cloned().collect();
        b.insert("unique_b".to_string());
        let sim = jaccard_similarity(&a, &b);
        assert!(sim >= 0.8);
    }

    // --- more DB-backed scenarios ---

    #[tokio::test]
    async fn has_replied_to_multiple_tweets() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let r1 = sample_reply("tweet_100", "Reply 1");
        let r2 = sample_reply("tweet_200", "Reply 2");
        insert_reply(&pool, &r1).await.expect("ins1");
        insert_reply(&pool, &r2).await.expect("ins2");
        assert!(checker.has_replied_to("tweet_100").await.expect("check"));
        assert!(checker.has_replied_to("tweet_200").await.expect("check"));
        assert!(!checker.has_replied_to("tweet_300").await.expect("check"));
    }

    #[tokio::test]
    async fn get_recent_reply_phrases_respects_limit() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        for i in 0..5 {
            let reply = sample_reply(&format!("t{i}"), &format!("Reply content {i}"));
            insert_reply(&pool, &reply).await.expect("insert");
        }
        let phrases = checker.get_recent_reply_phrases(3).await.expect("get");
        assert_eq!(phrases.len(), 3);
    }

    #[tokio::test]
    async fn is_phrasing_similar_moderate_overlap_not_triggered() {
        // Some shared tokens, but well below the 0.8 threshold.
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());
        let reply = sample_reply(
            "t1",
            "Rust has an amazing type system with lifetimes and borrowing rules",
        );
        insert_reply(&pool, &reply).await.expect("insert");
        assert!(!checker
            .is_phrasing_similar(
                "Python has dynamic typing with duck typing and runtime checks",
                20,
            )
            .await
            .expect("check"));
    }
}