Skip to main content

tuitbot_core/safety/
dedup.rs

1//! Duplicate reply prevention.
2//!
3//! Provides exact-match deduplication (never reply to the same tweet twice)
4//! and phrasing similarity detection (reject replies too similar to recent ones).
5
6use crate::error::StorageError;
7use crate::storage::DbPool;
8use std::collections::HashSet;
9
/// Checks for duplicate and similar replies.
///
/// Wraps a database pool; all lookups are delegated to
/// `crate::storage::replies` query helpers.
pub struct DedupChecker {
    /// Backing database pool used for every query issued by this checker.
    pool: DbPool,
}
14
15impl DedupChecker {
16    /// Create a new dedup checker backed by the given database pool.
17    pub fn new(pool: DbPool) -> Self {
18        Self { pool }
19    }
20
21    /// Check if a reply has already been sent to the given tweet.
22    ///
23    /// Returns `true` if a reply exists in `replies_sent` for this tweet ID.
24    pub async fn has_replied_to(&self, tweet_id: &str) -> Result<bool, StorageError> {
25        crate::storage::replies::has_replied_to(&self.pool, tweet_id).await
26    }
27
28    /// Check if a proposed reply is too similar to recent replies.
29    ///
30    /// Compares against the last `limit` replies using Jaccard word similarity.
31    /// Returns `true` if any recent reply has >= 0.8 similarity or is an exact match.
32    /// Replies shorter than 5 words skip the similarity check (too short for meaningful comparison).
33    pub async fn is_phrasing_similar(
34        &self,
35        new_reply: &str,
36        limit: i64,
37    ) -> Result<bool, StorageError> {
38        if new_reply.is_empty() {
39            return Ok(false);
40        }
41
42        let recent = crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await?;
43        let new_tokens = tokenize(new_reply);
44
45        for recent_reply in &recent {
46            // Exact match check
47            if new_reply == recent_reply {
48                return Ok(true);
49            }
50
51            // Skip similarity check for very short replies
52            if new_tokens.len() < 5 {
53                continue;
54            }
55
56            let recent_tokens = tokenize(recent_reply);
57            if jaccard_similarity(&new_tokens, &recent_tokens) >= 0.8 {
58                return Ok(true);
59            }
60        }
61
62        Ok(false)
63    }
64
65    /// Get recent reply contents for testing and debugging.
66    pub async fn get_recent_reply_phrases(&self, limit: i64) -> Result<Vec<String>, StorageError> {
67        crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await
68    }
69}
70
/// Split `text` on whitespace into a set of lowercase words, trimming any
/// non-alphanumeric characters from each word's edges.
///
/// Punctuation *inside* a word (e.g. apostrophes, hyphens) is preserved;
/// words that are pure punctuation collapse to nothing and are dropped.
fn tokenize(text: &str) -> HashSet<String> {
    let lowered = text.to_lowercase();
    lowered
        .split_whitespace()
        .filter_map(|word| {
            let stripped = word.trim_matches(|c: char| !c.is_alphanumeric());
            if stripped.is_empty() {
                None
            } else {
                Some(stripped.to_owned())
            }
        })
        .collect()
}
79
/// Jaccard similarity of two word sets: |a ∩ b| / |a ∪ b|.
///
/// Ranges from 0.0 (no shared words) to 1.0 (identical sets). When both
/// sets are empty the union is empty, and the pair is treated as identical
/// (returns 1.0) rather than dividing by zero.
fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    let union = a.union(b).count();
    if union == 0 {
        // Both sets empty: defined as identical.
        return 1.0;
    }
    let shared = a.intersection(b).count();
    shared as f64 / union as f64
}
92
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::init_test_db;
    use crate::storage::replies::{insert_reply, ReplySent};

    /// Construct a minimal sent-reply row targeting `target_id` with the given body.
    fn sample_reply(target_id: &str, content: &str) -> ReplySent {
        ReplySent {
            id: 0,
            target_tweet_id: target_id.to_string(),
            reply_tweet_id: Some("r_123".to_string()),
            reply_content: content.to_string(),
            llm_provider: None,
            llm_model: None,
            created_at: chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string(),
            status: "sent".to_string(),
            error_message: None,
        }
    }

    /// Build an owned word set from string literals.
    fn word_set(words: &[&str]) -> HashSet<String> {
        words.iter().map(|w| w.to_string()).collect()
    }

    #[test]
    fn tokenize_basic() {
        let tokens = tokenize("Hello, World! This is a test.");
        for expected in ["hello", "world", "test"] {
            assert!(tokens.contains(expected));
        }
        assert!(!tokens.contains(""));
    }

    #[test]
    fn tokenize_strips_punctuation() {
        let tokens = tokenize("(great) [tool] {for} developers!");
        for expected in ["great", "tool", "for", "developers"] {
            assert!(tokens.contains(expected));
        }
    }

    #[test]
    fn tokenize_empty_string() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn jaccard_identical_sets() {
        let words = word_set(&["hello", "world"]);
        assert!((jaccard_similarity(&words, &words) - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_sets() {
        let left = word_set(&["hello", "world"]);
        let right = word_set(&["foo", "bar"]);
        assert!(jaccard_similarity(&left, &right).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_partial_overlap() {
        // Shared: {hello, world}; union: {hello, world, foo, bar} => 2/4 = 0.5.
        let left = word_set(&["hello", "world", "foo"]);
        let right = word_set(&["hello", "world", "bar"]);
        assert!((jaccard_similarity(&left, &right) - 0.5).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_empty_sets() {
        let empty: HashSet<String> = HashSet::new();
        assert!((jaccard_similarity(&empty, &empty) - 1.0).abs() < f64::EPSILON);
    }

    #[tokio::test]
    async fn has_replied_to_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        // Nothing recorded yet.
        assert!(!checker.has_replied_to("tweet_123").await.expect("check"));

        insert_reply(&pool, &sample_reply("tweet_123", "Some reply"))
            .await
            .expect("insert");

        // Only the recorded tweet ID registers as replied-to.
        assert!(checker.has_replied_to("tweet_123").await.expect("check"));
        assert!(!checker.has_replied_to("tweet_456").await.expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_exact_match() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        insert_reply(&pool, &sample_reply("t1", "This is a great tool for developers"))
            .await
            .expect("insert");

        let verdict = checker
            .is_phrasing_similar("This is a great tool for developers", 20)
            .await
            .expect("check");
        assert!(verdict);
    }

    #[tokio::test]
    async fn is_phrasing_similar_high_overlap() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        insert_reply(
            &pool,
            &sample_reply("t1", "This is a great tool for developers and engineers"),
        )
        .await
        .expect("insert");

        // Very similar phrasing (most words overlap) must be rejected.
        let verdict = checker
            .is_phrasing_similar("This is a great tool for developers and designers", 20)
            .await
            .expect("check");
        assert!(verdict);
    }

    #[tokio::test]
    async fn is_phrasing_similar_no_overlap() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        insert_reply(
            &pool,
            &sample_reply("t1", "This is a great tool for developers and engineers"),
        )
        .await
        .expect("insert");

        let verdict = checker
            .is_phrasing_similar("I love cooking pasta with fresh basil and tomatoes", 20)
            .await
            .expect("check");
        assert!(!verdict);
    }

    #[tokio::test]
    async fn is_phrasing_similar_empty_string() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        insert_reply(&pool, &sample_reply("t1", "Some reply"))
            .await
            .expect("insert");

        assert!(!checker.is_phrasing_similar("", 20).await.expect("check"));
    }

    #[tokio::test]
    async fn is_phrasing_similar_short_reply_skips_similarity() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        insert_reply(&pool, &sample_reply("t1", "Great point!"))
            .await
            .expect("insert");

        // Even below the word threshold, an exact match still triggers.
        let exact = checker
            .is_phrasing_similar("Great point!", 20)
            .await
            .expect("check");
        assert!(exact);

        // Merely similar short phrases do not (avoids false positives).
        let near = checker
            .is_phrasing_similar("Good point!", 20)
            .await
            .expect("check");
        assert!(!near);
    }

    #[tokio::test]
    async fn is_phrasing_similar_no_recent_replies() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let verdict = checker
            .is_phrasing_similar("Any reply text here that is long enough to test", 20)
            .await
            .expect("check");
        assert!(!verdict);
    }

    #[tokio::test]
    async fn get_recent_reply_phrases_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        insert_reply(&pool, &sample_reply("t1", "Reply one")).await.expect("ins1");
        insert_reply(&pool, &sample_reply("t2", "Reply two")).await.expect("ins2");

        let phrases = checker.get_recent_reply_phrases(5).await.expect("get");
        assert_eq!(phrases.len(), 2);
    }
}