1use crate::domain::note::tokenize;
8use crate::domain::{MemoryLifecycleState, MemoryRecord};
9use serde::Serialize;
10use std::collections::BTreeSet;
11
12#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
13pub struct ContradictionHit {
14 pub existing_record_id: String,
15 pub existing_title: String,
16 pub signal: ContradictionSignal,
17}
18
19#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
20#[serde(rename_all = "snake_case")]
21pub enum ContradictionSignal {
22 Negation,
23 Replacement,
24}
25
/// Minimum Jaccard similarity between the token sets of the new summary and
/// an existing summary for `detect` to consider them to be about the same
/// topic.
const OVERLAP_THRESHOLD: f64 = 0.3;

/// Minimum number of tokens a summary must have before it takes part in any
/// similarity comparison (`detect` and `find_duplicates`).
const MIN_TOKENS: usize = 2;

/// Chinese negation markers, matched as plain substrings of the lowercased
/// new summary.
const ZH_NEGATION: &[&str] = &[
    "不", "没", "别", "勿", "停止", "取消", "禁止", "不要", "不再",
];

/// English negation markers. They are written in lowercase because they are
/// matched as plain substrings of the lowercased new summary.
const EN_NEGATION: &[&str] = &[
    "not",
    "don't",
    "never",
    "stop",
    "cancel",
    "disable",
    "remove",
    "no longer",
];

/// Chinese replacement markers, matched as plain substrings of the lowercased
/// new summary.
const ZH_REPLACEMENT: &[&str] = &["替代", "改用", "换成", "替换", "而不是", "弃用"];

/// English replacement markers. They are written in lowercase because they
/// are matched as plain substrings of the lowercased new summary.
const EN_REPLACEMENT: &[&str] = &[
    "instead of",
    "replace",
    "switch to",
    "migrate to",
    "move from",
    "rather than",
];
62
63pub fn detect(
67 new_summary: &str,
68 new_memory_type: &str,
69 existing: &[(String, MemoryRecord)],
70) -> Vec<ContradictionHit> {
71 let new_tokens = tokenize(new_summary);
72 if new_tokens.len() < MIN_TOKENS {
73 return Vec::new();
74 }
75 let new_lower = new_summary.to_lowercase();
76
77 let mut hits = Vec::new();
78
79 for (record_id, record) in existing {
80 if record.memory_type != new_memory_type {
81 continue;
82 }
83 if !matches!(
84 record.state,
85 MemoryLifecycleState::Accepted | MemoryLifecycleState::Canonical
86 ) {
87 continue;
88 }
89
90 let existing_tokens = tokenize(&record.summary);
91 if existing_tokens.len() < MIN_TOKENS {
92 continue;
93 }
94
95 let similarity = jaccard(&new_tokens, &existing_tokens);
96 if similarity < OVERLAP_THRESHOLD {
97 continue;
98 }
99
100 let has_replacement = contains_any_marker(&new_lower, ZH_REPLACEMENT)
101 || contains_any_marker(&new_lower, EN_REPLACEMENT);
102 let has_negation = contains_any_marker(&new_lower, ZH_NEGATION)
103 || contains_any_marker(&new_lower, EN_NEGATION);
104
105 let signal = if has_replacement {
107 Some(ContradictionSignal::Replacement)
108 } else if has_negation {
109 Some(ContradictionSignal::Negation)
110 } else {
111 None
112 };
113
114 if let Some(signal) = signal {
115 hits.push(ContradictionHit {
116 existing_record_id: record_id.clone(),
117 existing_title: record.title.clone(),
118 signal,
119 });
120 }
121 }
122
123 hits
124}
125
/// Jaccard similarity of two token sets: `|a ∩ b| / |a ∪ b|`.
///
/// Returns `0.0` when both sets are empty, otherwise a value in `[0.0, 1.0]`.
fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
    let shared = a.iter().filter(|token| b.contains(*token)).count();
    // |a ∪ b| = |a| + |b| - |a ∩ b|
    let total = a.len() + b.len() - shared;
    match total {
        0 => 0.0,
        _ => shared as f64 / total as f64,
    }
}
134
/// Returns `true` when `text` contains at least one of `markers`.
///
/// Matching is a plain, case-sensitive substring search with no word-boundary
/// check, so a marker such as `"not"` also matches inside `"notifications"`.
/// Callers that want case-insensitive matching must lowercase `text` first.
fn contains_any_marker(text: &str, markers: &[&str]) -> bool {
    for &marker in markers {
        if text.contains(marker) {
            return true;
        }
    }
    false
}
138
139#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
140pub struct DedupSuggestion {
141 pub record_id_a: String,
142 pub record_id_b: String,
143 pub title_a: String,
144 pub title_b: String,
145 pub similarity: u32,
146}
147
148pub fn find_duplicates(records: &[(String, MemoryRecord)], threshold: f64) -> Vec<DedupSuggestion> {
149 let mut suggestions = Vec::new();
150 let active: Vec<_> = records
151 .iter()
152 .filter(|(_, r)| {
153 matches!(
154 r.state,
155 MemoryLifecycleState::Accepted | MemoryLifecycleState::Canonical
156 )
157 })
158 .collect();
159
160 for i in 0..active.len() {
161 let tokens_a = tokenize(&active[i].1.summary);
162 if tokens_a.len() < MIN_TOKENS {
163 continue;
164 }
165 for j in (i + 1)..active.len() {
166 if active[i].1.memory_type != active[j].1.memory_type {
167 continue;
168 }
169 let tokens_b = tokenize(&active[j].1.summary);
170 if tokens_b.len() < MIN_TOKENS {
171 continue;
172 }
173 let sim = jaccard(&tokens_a, &tokens_b);
174 if sim >= threshold {
175 suggestions.push(DedupSuggestion {
176 record_id_a: active[i].0.clone(),
177 record_id_b: active[j].0.clone(),
178 title_a: active[i].1.title.clone(),
179 title_b: active[j].1.title.clone(),
180 similarity: (sim * 100.0) as u32,
181 });
182 }
183 }
184 }
185 suggestions.sort_by_key(|s| std::cmp::Reverse(s.similarity));
186 suggestions
187}
188
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{MemoryLifecycleState, MemoryPromotionAction, MemoryRecord, MemoryScope};

    /// Builds a `preference`/`workflow`/... record in the requested lifecycle
    /// state using the constructors and promotion actions of `MemoryRecord`.
    fn record(
        title: &str,
        summary: &str,
        memory_type: &str,
        state: MemoryLifecycleState,
    ) -> MemoryRecord {
        let base = MemoryRecord::new_manual(title, summary, memory_type, MemoryScope::User, "test");
        match state {
            MemoryLifecycleState::Accepted => base,
            MemoryLifecycleState::Canonical => {
                base.apply(MemoryPromotionAction::PromoteToCanonical)
            }
            MemoryLifecycleState::Candidate => MemoryRecord::new_ai_proposal(
                title,
                summary,
                memory_type,
                MemoryScope::User,
                "test",
            ),
            MemoryLifecycleState::Archived => base.apply(MemoryPromotionAction::Archive),
            MemoryLifecycleState::Draft => {
                let mut r = MemoryRecord::new_ai_proposal(
                    title,
                    summary,
                    memory_type,
                    MemoryScope::User,
                    "test",
                );
                r.state = MemoryLifecycleState::Draft;
                r
            }
        }
    }

    /// Converts `(&str id, record)` pairs into the owned form the detectors take.
    fn existing_list(items: Vec<(&str, MemoryRecord)>) -> Vec<(String, MemoryRecord)> {
        items
            .into_iter()
            .map(|(id, r)| (id.to_string(), r))
            .collect()
    }

    #[test]
    fn detect_finds_negation_same_type() {
        let existing = existing_list(vec![(
            "rec-1",
            record(
                "用 cargo install",
                "用 cargo install 安装 binary 到 ~/.cargo/bin",
                "preference",
                MemoryLifecycleState::Accepted,
            ),
        )]);

        let hits = detect("不用 cargo install 安装 binary", "preference", &existing);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].existing_record_id, "rec-1");
        assert_eq!(hits[0].signal, ContradictionSignal::Negation);
    }

    #[test]
    fn detect_finds_replacement() {
        let existing = existing_list(vec![(
            "rec-2",
            record(
                "用 React",
                "前端框架用 React 构建 UI 组件",
                "preference",
                MemoryLifecycleState::Accepted,
            ),
        )]);

        let hits = detect("改用 Vue 替代 React 构建 UI 组件", "preference", &existing);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].existing_record_id, "rec-2");
        assert_eq!(hits[0].signal, ContradictionSignal::Replacement);
    }

    #[test]
    fn detect_skips_different_type() {
        let existing = existing_list(vec![(
            "rec-3",
            record(
                "用 cargo install",
                "用 cargo install 安装 binary 到 ~/.cargo/bin",
                "workflow",
                MemoryLifecycleState::Accepted,
            ),
        )]);

        let hits = detect("不用 cargo install 安装 binary", "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_skips_low_overlap() {
        let existing = existing_list(vec![(
            "rec-4",
            record(
                "用 cargo install",
                "用 cargo install 安装 binary 到 ~/.cargo/bin",
                "preference",
                MemoryLifecycleState::Accepted,
            ),
        )]);

        let hits = detect("不要在周末加班写代码", "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_skips_archived() {
        let existing = existing_list(vec![(
            "rec-5",
            record(
                "用 cargo install",
                "用 cargo install 安装 binary 到 ~/.cargo/bin",
                "preference",
                MemoryLifecycleState::Archived,
            ),
        )]);

        let hits = detect("不用 cargo install 安装 binary", "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_skips_candidate_state() {
        let existing = existing_list(vec![(
            "rec-6",
            record(
                "用 cargo install",
                "用 cargo install 安装 binary 到 ~/.cargo/bin",
                "preference",
                MemoryLifecycleState::Candidate,
            ),
        )]);

        let hits = detect("不用 cargo install 安装 binary", "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_handles_empty_existing() {
        let hits = detect("不用 cargo install", "preference", &[]);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_english_negation() {
        let existing = existing_list(vec![(
            "rec-7",
            record(
                "Use JWT",
                "use JWT tokens for API authentication",
                "preference",
                MemoryLifecycleState::Accepted,
            ),
        )]);

        let hits = detect(
            "don't use JWT tokens for API authentication, use sessions",
            "preference",
            &existing,
        );
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].existing_record_id, "rec-7");
        assert_eq!(hits[0].signal, ContradictionSignal::Negation);
    }

    #[test]
    fn detect_english_replacement() {
        let existing = existing_list(vec![(
            "rec-8",
            record(
                "Deploy to AWS",
                "deploy all services to AWS infrastructure",
                "preference",
                MemoryLifecycleState::Canonical,
            ),
        )]);

        let hits = detect(
            "migrate to GCP instead of AWS for all services infrastructure",
            "preference",
            &existing,
        );
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].existing_record_id, "rec-8");
        assert_eq!(hits[0].signal, ContradictionSignal::Replacement);
    }

    // A fully overlapping summary without any negation/replacement marker is
    // a restatement, not a contradiction.
    #[test]
    fn detect_ignores_overlap_without_marker() {
        let existing = existing_list(vec![(
            "rec-9",
            record(
                "Use JWT",
                "use JWT tokens for API authentication",
                "preference",
                MemoryLifecycleState::Accepted,
            ),
        )]);

        let hits = detect(
            "use JWT tokens for API authentication",
            "preference",
            &existing,
        );
        assert!(hits.is_empty());
    }

    #[test]
    fn find_duplicates_flags_identical_summaries() {
        let records = existing_list(vec![
            (
                "dup-a",
                record(
                    "Use JWT",
                    "use JWT tokens for API authentication",
                    "preference",
                    MemoryLifecycleState::Accepted,
                ),
            ),
            (
                "dup-b",
                record(
                    "JWT for APIs",
                    "use JWT tokens for API authentication",
                    "preference",
                    MemoryLifecycleState::Canonical,
                ),
            ),
        ]);

        let suggestions = find_duplicates(&records, 0.8);
        assert_eq!(suggestions.len(), 1);
        assert_eq!(suggestions[0].record_id_a, "dup-a");
        assert_eq!(suggestions[0].record_id_b, "dup-b");
        assert_eq!(suggestions[0].title_a, "Use JWT");
        assert_eq!(suggestions[0].title_b, "JWT for APIs");
        assert_eq!(suggestions[0].similarity, 100);
    }

    #[test]
    fn find_duplicates_skips_different_types() {
        let records = existing_list(vec![
            (
                "dup-c",
                record(
                    "Use JWT",
                    "use JWT tokens for API authentication",
                    "preference",
                    MemoryLifecycleState::Accepted,
                ),
            ),
            (
                "dup-d",
                record(
                    "Use JWT",
                    "use JWT tokens for API authentication",
                    "workflow",
                    MemoryLifecycleState::Accepted,
                ),
            ),
        ]);

        assert!(find_duplicates(&records, 0.8).is_empty());
    }

    #[test]
    fn find_duplicates_skips_inactive_records() {
        let records = existing_list(vec![
            (
                "dup-e",
                record(
                    "Use JWT",
                    "use JWT tokens for API authentication",
                    "preference",
                    MemoryLifecycleState::Accepted,
                ),
            ),
            (
                "dup-f",
                record(
                    "Use JWT",
                    "use JWT tokens for API authentication",
                    "preference",
                    MemoryLifecycleState::Archived,
                ),
            ),
        ]);

        assert!(find_duplicates(&records, 0.8).is_empty());
    }

    #[test]
    fn find_duplicates_handles_empty_input() {
        assert!(find_duplicates(&[], 0.5).is_empty());
    }
}