// tuitbot_core/safety/dedup.rs
use crate::error::StorageError;
use crate::storage::DbPool;
use std::collections::HashSet;

/// Checks outgoing replies against already-sent replies so the bot never
/// replies to the same tweet twice and never repeats near-identical phrasing.
///
/// All state lives in the database; this struct only holds a connection pool.
pub struct DedupChecker {
    // Shared database pool used for all lookups.
    pool: DbPool,
}
14
15impl DedupChecker {
16 pub fn new(pool: DbPool) -> Self {
18 Self { pool }
19 }
20
21 pub async fn has_replied_to(&self, tweet_id: &str) -> Result<bool, StorageError> {
25 crate::storage::replies::has_replied_to(&self.pool, tweet_id).await
26 }
27
28 pub async fn is_phrasing_similar(
34 &self,
35 new_reply: &str,
36 limit: i64,
37 ) -> Result<bool, StorageError> {
38 if new_reply.is_empty() {
39 return Ok(false);
40 }
41
42 let recent = crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await?;
43 let new_tokens = tokenize(new_reply);
44
45 for recent_reply in &recent {
46 if new_reply == recent_reply {
48 return Ok(true);
49 }
50
51 if new_tokens.len() < 5 {
53 continue;
54 }
55
56 let recent_tokens = tokenize(recent_reply);
57 if jaccard_similarity(&new_tokens, &recent_tokens) >= 0.8 {
58 return Ok(true);
59 }
60 }
61
62 Ok(false)
63 }
64
65 pub async fn get_recent_reply_phrases(&self, limit: i64) -> Result<Vec<String>, StorageError> {
67 crate::storage::replies::get_recent_reply_contents(&self.pool, limit).await
68 }
69}
70
/// Splits `text` into a set of lowercase words with leading/trailing
/// non-alphanumeric characters stripped. Empty tokens (pure punctuation)
/// are discarded; interior punctuation (e.g. hyphens) is kept.
fn tokenize(text: &str) -> HashSet<String> {
    let lowered = text.to_lowercase();
    let mut tokens = HashSet::new();
    for word in lowered.split_whitespace() {
        let cleaned = word.trim_matches(|c: char| !c.is_alphanumeric());
        if !cleaned.is_empty() {
            tokens.insert(cleaned.to_string());
        }
    }
    tokens
}
79
/// Jaccard similarity of two token sets: |a ∩ b| / |a ∪ b|, in [0.0, 1.0].
/// Two empty sets are defined as identical (1.0), which also avoids 0/0.
fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    if a.is_empty() && b.is_empty() {
        1.0
    } else {
        let shared = a.intersection(b).count() as f64;
        let total = a.union(b).count() as f64;
        shared / total
    }
}
92
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::init_test_db;
    use crate::storage::replies::{insert_reply, ReplySent};

    /// Builds a minimal `ReplySent` row for inserting test fixtures.
    /// `id: 0` relies on the DB assigning the real id on insert.
    fn sample_reply(target_id: &str, content: &str) -> ReplySent {
        ReplySent {
            id: 0,
            target_tweet_id: target_id.to_string(),
            reply_tweet_id: Some("r_123".to_string()),
            reply_content: content.to_string(),
            llm_provider: None,
            llm_model: None,
            created_at: chrono::Utc::now().format("%Y-%m-%dT%H:%M:%SZ").to_string(),
            status: "sent".to_string(),
            error_message: None,
        }
    }

    // Tokenization lowercases and keeps alphanumeric words.
    #[test]
    fn tokenize_basic() {
        let tokens = tokenize("Hello, World! This is a test.");
        assert!(tokens.contains("hello"));
        assert!(tokens.contains("world"));
        assert!(tokens.contains("test"));
        assert!(!tokens.contains(""));
    }

    // Leading/trailing punctuation is stripped from each word.
    #[test]
    fn tokenize_strips_punctuation() {
        let tokens = tokenize("(great) [tool] {for} developers!");
        assert!(tokens.contains("great"));
        assert!(tokens.contains("tool"));
        assert!(tokens.contains("for"));
        assert!(tokens.contains("developers"));
    }

    // Empty input produces an empty token set.
    #[test]
    fn tokenize_empty_string() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    // Identical sets have similarity 1.0.
    #[test]
    fn jaccard_identical_sets() {
        let a: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let b = a.clone();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    // Disjoint sets have similarity 0.0.
    #[test]
    fn jaccard_disjoint_sets() {
        let a: HashSet<String> = ["hello", "world"].iter().map(|s| s.to_string()).collect();
        let b: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
        assert!((jaccard_similarity(&a, &b)).abs() < f64::EPSILON);
    }

    // 2 shared of 4 total tokens -> similarity 0.5.
    #[test]
    fn jaccard_partial_overlap() {
        let a: HashSet<String> = ["hello", "world", "foo"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let b: HashSet<String> = ["hello", "world", "bar"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let sim = jaccard_similarity(&a, &b);
        assert!((sim - 0.5).abs() < f64::EPSILON);
    }

    // Two empty sets are defined as identical (avoids 0/0).
    #[test]
    fn jaccard_empty_sets() {
        let a: HashSet<String> = HashSet::new();
        let b: HashSet<String> = HashSet::new();
        assert!((jaccard_similarity(&a, &b) - 1.0).abs() < f64::EPSILON);
    }

    // has_replied_to reflects rows inserted into the replies table,
    // keyed by target tweet id.
    #[tokio::test]
    async fn has_replied_to_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        assert!(!checker.has_replied_to("tweet_123").await.expect("check"));

        let reply = sample_reply("tweet_123", "Some reply");
        insert_reply(&pool, &reply).await.expect("insert");

        assert!(checker.has_replied_to("tweet_123").await.expect("check"));
        assert!(!checker.has_replied_to("tweet_456").await.expect("check"));
    }

    // Byte-for-byte identical replies are always flagged.
    #[tokio::test]
    async fn is_phrasing_similar_exact_match() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let reply = sample_reply("t1", "This is a great tool for developers");
        insert_reply(&pool, &reply).await.expect("insert");

        assert!(checker
            .is_phrasing_similar("This is a great tool for developers", 20)
            .await
            .expect("check"));
    }

    // 8 of 9 tokens shared (Jaccard 8/10 = 0.8) meets the threshold.
    #[tokio::test]
    async fn is_phrasing_similar_high_overlap() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let reply = sample_reply("t1", "This is a great tool for developers and engineers");
        insert_reply(&pool, &reply).await.expect("insert");

        assert!(checker
            .is_phrasing_similar("This is a great tool for developers and designers", 20)
            .await
            .expect("check"));
    }

    // Completely different wording is not flagged.
    #[tokio::test]
    async fn is_phrasing_similar_no_overlap() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let reply = sample_reply("t1", "This is a great tool for developers and engineers");
        insert_reply(&pool, &reply).await.expect("insert");

        assert!(!checker
            .is_phrasing_similar("I love cooking pasta with fresh basil and tomatoes", 20)
            .await
            .expect("check"));
    }

    // Empty candidate replies short-circuit to "not similar".
    #[tokio::test]
    async fn is_phrasing_similar_empty_string() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let reply = sample_reply("t1", "Some reply");
        insert_reply(&pool, &reply).await.expect("insert");

        assert!(!checker.is_phrasing_similar("", 20).await.expect("check"));
    }

    // Short replies (< 5 tokens): exact match still flags, but the
    // Jaccard check is skipped, so a near-duplicate passes.
    #[tokio::test]
    async fn is_phrasing_similar_short_reply_skips_similarity() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let reply = sample_reply("t1", "Great point!");
        insert_reply(&pool, &reply).await.expect("insert");

        assert!(checker
            .is_phrasing_similar("Great point!", 20)
            .await
            .expect("check"));

        assert!(!checker
            .is_phrasing_similar("Good point!", 20)
            .await
            .expect("check"));
    }

    // With an empty replies table nothing can be similar.
    #[tokio::test]
    async fn is_phrasing_similar_no_recent_replies() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        assert!(!checker
            .is_phrasing_similar("Any reply text here that is long enough to test", 20)
            .await
            .expect("check"));
    }

    // get_recent_reply_phrases returns one entry per inserted reply
    // (up to the limit).
    #[tokio::test]
    async fn get_recent_reply_phrases_works() {
        let pool = init_test_db().await.expect("init db");
        let checker = DedupChecker::new(pool.clone());

        let r1 = sample_reply("t1", "Reply one");
        let r2 = sample_reply("t2", "Reply two");
        insert_reply(&pool, &r1).await.expect("ins1");
        insert_reply(&pool, &r2).await.expect("ins2");

        let phrases = checker.get_recent_reply_phrases(5).await.expect("get");
        assert_eq!(phrases.len(), 2);
    }
}