use crate::domain::note::tokenize;
use crate::domain::{MemoryLifecycleState, MemoryRecord};
use serde::Serialize;
use std::collections::BTreeSet;
/// An existing memory record that a new summary appears to contradict.
///
/// Produced by [`detect`]; one hit is emitted per matching existing record.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct ContradictionHit {
/// Identifier of the existing record, cloned from the caller-supplied `(id, record)` pair.
pub existing_record_id: String,
/// Title of the existing record, cloned from `MemoryRecord::title`.
pub existing_title: String,
/// Kind of marker found in the new summary that triggered this hit.
pub signal: ContradictionSignal,
}
/// Why a new summary is considered to contradict an existing record.
///
/// Serialized in snake_case (`"negation"` / `"replacement"`). When a summary
/// contains both kinds of marker, [`detect`] reports `Replacement`.
#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ContradictionSignal {
/// The new summary contains a negation marker (e.g. "not", "不要").
Negation,
/// The new summary contains a replacement marker (e.g. "instead of", "改用").
Replacement,
}
/// Minimum Jaccard similarity between the new summary's tokens and an existing
/// record's tokens for `detect` to consider the two related.
const OVERLAP_THRESHOLD: f64 = 0.3;
/// Summaries that tokenize to fewer than this many tokens are ignored by both
/// `detect` and `find_duplicates`.
const MIN_TOKENS: usize = 2;
/// Chinese negation markers, matched as substrings of the lowercased new summary.
const ZH_NEGATION: &[&str] = &[
"不", "没", "别", "勿", "停止", "取消", "禁止", "不要", "不再",
];
/// English negation markers, matched as substrings of the lowercased new summary.
///
/// NOTE(review): matching is plain substring search, so "not" also matches inside
/// words such as "notebook" or "cannot" — confirm this is the intended heuristic.
const EN_NEGATION: &[&str] = &[
"not",
"don't",
"never",
"stop",
"cancel",
"disable",
"remove",
"no longer",
];
/// Chinese replacement markers, matched as substrings of the lowercased new summary.
const ZH_REPLACEMENT: &[&str] = &["替代", "改用", "换成", "替换", "而不是", "弃用"];
/// English replacement markers, matched as substrings of the lowercased new summary.
const EN_REPLACEMENT: &[&str] = &[
"instead of",
"replace",
"switch to",
"migrate to",
"move from",
"rather than",
];
/// Finds existing memories that a new summary may contradict.
///
/// A record in `existing` is reported when all of the following hold:
///
/// * its `memory_type` equals `new_memory_type`;
/// * its state is `Accepted` or `Canonical`;
/// * its summary tokenizes to at least `MIN_TOKENS` tokens;
/// * the Jaccard similarity between its tokens and the new summary's tokens
///   is at least `OVERLAP_THRESHOLD`;
/// * the new summary (lowercased) contains a replacement or negation marker.
///
/// The signal is derived from the new summary alone, so every hit returned by
/// one call carries the same [`ContradictionSignal`]. Replacement markers take
/// precedence over negation markers. Markers are matched as plain substrings.
///
/// Returns an empty vector when the new summary has fewer than `MIN_TOKENS`
/// tokens or contains no marker at all. Hits keep the order of `existing`.
pub fn detect(
    new_summary: &str,
    new_memory_type: &str,
    existing: &[(String, MemoryRecord)],
) -> Vec<ContradictionHit> {
    let new_tokens = tokenize(new_summary);
    if new_tokens.len() < MIN_TOKENS {
        return Vec::new();
    }

    // The signal only depends on the new summary, so classify it once up front
    // instead of re-scanning the marker lists for every existing record.
    let new_lower = new_summary.to_lowercase();
    let has_replacement = contains_any_marker(&new_lower, ZH_REPLACEMENT)
        || contains_any_marker(&new_lower, EN_REPLACEMENT);
    let signal = if has_replacement {
        ContradictionSignal::Replacement
    } else if contains_any_marker(&new_lower, ZH_NEGATION)
        || contains_any_marker(&new_lower, EN_NEGATION)
    {
        ContradictionSignal::Negation
    } else {
        // Without a marker no record can produce a hit; skip tokenizing them.
        return Vec::new();
    };

    existing
        .iter()
        .filter(|(_, record)| record.memory_type == new_memory_type)
        .filter(|(_, record)| {
            matches!(
                record.state,
                MemoryLifecycleState::Accepted | MemoryLifecycleState::Canonical
            )
        })
        .filter(|(_, record)| {
            let existing_tokens = tokenize(&record.summary);
            existing_tokens.len() >= MIN_TOKENS
                && jaccard(&new_tokens, &existing_tokens) >= OVERLAP_THRESHOLD
        })
        .map(|(record_id, record)| ContradictionHit {
            existing_record_id: record_id.clone(),
            existing_title: record.title.clone(),
            signal,
        })
        .collect()
}
/// Jaccard similarity of two token sets: `|a ∩ b| / |a ∪ b|`.
///
/// Returns `0.0` when both sets are empty, so the result is never NaN.
fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
    let shared = a.intersection(b).count();
    // Inclusion–exclusion: |a ∪ b| = |a| + |b| − |a ∩ b|.
    match a.len() + b.len() - shared {
        0 => 0.0,
        total => shared as f64 / total as f64,
    }
}
/// Reports whether `text` contains at least one of `markers` as a substring.
///
/// Matching is case-sensitive; callers lowercase `text` beforehand. An empty
/// `markers` slice yields `false`.
fn contains_any_marker(text: &str, markers: &[&str]) -> bool {
    for marker in markers {
        if text.contains(*marker) {
            return true;
        }
    }
    false
}
/// A pair of records whose summaries are similar enough to be possible duplicates.
///
/// Produced by [`find_duplicates`]; `a` is the record that appears earlier in
/// the input slice.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct DedupSuggestion {
/// Identifier of the first record of the pair.
pub record_id_a: String,
/// Identifier of the second record of the pair.
pub record_id_b: String,
/// Title of the first record.
pub title_a: String,
/// Title of the second record.
pub title_b: String,
/// Jaccard similarity of the two summaries as a percentage, truncated
/// (not rounded) to an integer.
pub similarity: u32,
}
/// Suggests pairs of active records whose summaries look like duplicates.
///
/// Only records in the `Accepted` or `Canonical` state whose summary tokenizes
/// to at least `MIN_TOKENS` tokens are considered. Two such records form a
/// suggestion when they share the same `memory_type` and the Jaccard
/// similarity of their token sets is at least `threshold` (a fraction in
/// `0.0..=1.0`, not a percentage).
///
/// Each unordered pair is reported at most once, with the record that appears
/// earlier in `records` as `a`. The result is sorted by descending similarity;
/// the sort is stable, so ties keep their discovery order.
pub fn find_duplicates(records: &[(String, MemoryRecord)], threshold: f64) -> Vec<DedupSuggestion> {
    // Tokenize every eligible record exactly once; doing it inside the pair
    // loop would repeat the work for each partner.
    let candidates: Vec<_> = records
        .iter()
        .filter(|(_, record)| {
            matches!(
                record.state,
                MemoryLifecycleState::Accepted | MemoryLifecycleState::Canonical
            )
        })
        .filter_map(|(id, record)| {
            let tokens = tokenize(&record.summary);
            if tokens.len() >= MIN_TOKENS {
                Some((id, record, tokens))
            } else {
                None
            }
        })
        .collect();

    let mut suggestions = Vec::new();
    for (idx, (id_a, record_a, tokens_a)) in candidates.iter().enumerate() {
        // Only look at later candidates so each pair is visited once.
        for (id_b, record_b, tokens_b) in &candidates[idx + 1..] {
            if record_a.memory_type != record_b.memory_type {
                continue;
            }
            let sim = jaccard(tokens_a, tokens_b);
            if sim >= threshold {
                suggestions.push(DedupSuggestion {
                    record_id_a: (*id_a).clone(),
                    record_id_b: (*id_b).clone(),
                    title_a: record_a.title.clone(),
                    title_b: record_b.title.clone(),
                    // `sim` is within 0.0..=1.0, so the cast cannot overflow;
                    // it truncates toward zero.
                    similarity: (sim * 100.0) as u32,
                });
            }
        }
    }

    suggestions.sort_by_key(|s| std::cmp::Reverse(s.similarity));
    suggestions
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{MemoryLifecycleState, MemoryPromotionAction, MemoryRecord, MemoryScope};

    /// Title of the `cargo install` preference shared by several tests.
    const CARGO_TITLE: &str = "用 cargo install";
    /// Summary of the `cargo install` preference shared by several tests.
    const CARGO_SUMMARY: &str = "用 cargo install 安装 binary 到 ~/.cargo/bin";
    /// New summary that negates the `cargo install` preference.
    const CARGO_NEGATED: &str = "不用 cargo install 安装 binary";

    /// Builds a record with the given content and drives it into `state`.
    fn record(
        title: &str,
        summary: &str,
        memory_type: &str,
        state: MemoryLifecycleState,
    ) -> MemoryRecord {
        let base = MemoryRecord::new_manual(title, summary, memory_type, MemoryScope::User, "test");
        let proposal = || {
            MemoryRecord::new_ai_proposal(title, summary, memory_type, MemoryScope::User, "test")
        };
        match state {
            MemoryLifecycleState::Accepted => base,
            MemoryLifecycleState::Canonical => {
                base.apply(MemoryPromotionAction::PromoteToCanonical)
            }
            MemoryLifecycleState::Candidate => proposal(),
            MemoryLifecycleState::Archived => base.apply(MemoryPromotionAction::Archive),
            MemoryLifecycleState::Draft => {
                let mut draft = proposal();
                draft.state = MemoryLifecycleState::Draft;
                draft
            }
        }
    }

    /// Converts `(&str, record)` pairs into the owned-id form `detect` expects.
    fn existing_list(items: Vec<(&str, MemoryRecord)>) -> Vec<(String, MemoryRecord)> {
        let mut owned = Vec::with_capacity(items.len());
        for (id, rec) in items {
            owned.push((id.to_string(), rec));
        }
        owned
    }

    /// A one-element list holding the shared `cargo install` record.
    fn cargo_fixture(
        id: &str,
        memory_type: &str,
        state: MemoryLifecycleState,
    ) -> Vec<(String, MemoryRecord)> {
        existing_list(vec![(
            id,
            record(CARGO_TITLE, CARGO_SUMMARY, memory_type, state),
        )])
    }

    /// Asserts that `hits` holds exactly one hit for `expected_id` with `expected_signal`.
    #[track_caller]
    fn assert_single_hit(
        hits: &[ContradictionHit],
        expected_id: &str,
        expected_signal: ContradictionSignal,
    ) {
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].existing_record_id, expected_id);
        assert_eq!(hits[0].signal, expected_signal);
    }

    #[test]
    fn detect_finds_negation_same_type() {
        let existing = cargo_fixture("rec-1", "preference", MemoryLifecycleState::Accepted);
        let hits = detect(CARGO_NEGATED, "preference", &existing);
        assert_single_hit(&hits, "rec-1", ContradictionSignal::Negation);
    }

    #[test]
    fn detect_finds_replacement() {
        let react = record(
            "用 React",
            "前端框架用 React 构建 UI 组件",
            "preference",
            MemoryLifecycleState::Accepted,
        );
        let existing = existing_list(vec![("rec-2", react)]);
        let hits = detect("改用 Vue 替代 React 构建 UI 组件", "preference", &existing);
        assert_single_hit(&hits, "rec-2", ContradictionSignal::Replacement);
    }

    #[test]
    fn detect_skips_different_type() {
        let existing = cargo_fixture("rec-3", "workflow", MemoryLifecycleState::Accepted);
        let hits = detect(CARGO_NEGATED, "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_skips_low_overlap() {
        let existing = cargo_fixture("rec-4", "preference", MemoryLifecycleState::Accepted);
        let hits = detect("不要在周末加班写代码", "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_skips_archived() {
        let existing = cargo_fixture("rec-5", "preference", MemoryLifecycleState::Archived);
        let hits = detect(CARGO_NEGATED, "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_skips_candidate_state() {
        let existing = cargo_fixture("rec-6", "preference", MemoryLifecycleState::Candidate);
        let hits = detect(CARGO_NEGATED, "preference", &existing);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_handles_empty_existing() {
        let hits = detect("不用 cargo install", "preference", &[]);
        assert!(hits.is_empty());
    }

    #[test]
    fn detect_english_negation() {
        let jwt = record(
            "Use JWT",
            "use JWT tokens for API authentication",
            "preference",
            MemoryLifecycleState::Accepted,
        );
        let existing = existing_list(vec![("rec-7", jwt)]);
        let hits = detect(
            "don't use JWT tokens for API authentication, use sessions",
            "preference",
            &existing,
        );
        assert_single_hit(&hits, "rec-7", ContradictionSignal::Negation);
    }

    #[test]
    fn detect_english_replacement() {
        let aws = record(
            "Deploy to AWS",
            "deploy all services to AWS infrastructure",
            "preference",
            MemoryLifecycleState::Canonical,
        );
        let existing = existing_list(vec![("rec-8", aws)]);
        let hits = detect(
            "migrate to GCP instead of AWS for all services infrastructure",
            "preference",
            &existing,
        );
        assert_single_hit(&hits, "rec-8", ContradictionSignal::Replacement);
    }
}