#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
use std::collections::HashMap;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ClusterAction {
KeepRepresentative,
ManualReview,
DeleteAll,
NoAction,
}
impl ClusterAction {
#[must_use]
pub fn label(&self) -> &'static str {
match self {
Self::KeepRepresentative => "keep_representative",
Self::ManualReview => "manual_review",
Self::DeleteAll => "delete_all",
Self::NoAction => "no_action",
}
}
}
#[derive(Debug, Clone)]
pub struct SimilarityEdge {
pub member_a: usize,
pub member_b: usize,
pub score: f64,
}
impl SimilarityEdge {
#[must_use]
pub fn new(member_a: usize, member_b: usize, score: f64) -> Self {
Self {
member_a,
member_b,
score,
}
}
}
#[derive(Debug, Clone)]
pub struct NearDuplicateCluster {
pub id: usize,
pub members: Vec<String>,
pub edges: Vec<SimilarityEdge>,
pub representative: Option<String>,
pub confidence: f64,
pub action: ClusterAction,
}
impl NearDuplicateCluster {
#[must_use]
pub fn new(id: usize) -> Self {
Self {
id,
members: Vec::new(),
edges: Vec::new(),
representative: None,
confidence: 0.0,
action: ClusterAction::NoAction,
}
}
#[must_use]
pub fn size(&self) -> usize {
self.members.len()
}
#[must_use]
pub fn is_duplicate_group(&self) -> bool {
self.members.len() >= 2
}
#[must_use]
pub fn min_similarity(&self) -> f64 {
self.edges
.iter()
.map(|e| e.score)
.fold(f64::INFINITY, f64::min)
.min(1.0) }
#[must_use]
pub fn max_similarity(&self) -> f64 {
self.edges
.iter()
.map(|e| e.score)
.fold(f64::NEG_INFINITY, f64::max)
.max(0.0)
}
#[must_use]
pub fn mean_similarity(&self) -> f64 {
if self.edges.is_empty() {
return 0.0;
}
let sum: f64 = self.edges.iter().map(|e| e.score).sum();
sum / self.edges.len() as f64
}
#[must_use]
pub fn std_similarity(&self) -> f64 {
if self.edges.len() < 2 {
return 0.0;
}
let mean = self.mean_similarity();
let variance: f64 = self
.edges
.iter()
.map(|e| (e.score - mean).powi(2))
.sum::<f64>()
/ self.edges.len() as f64;
variance.sqrt()
}
pub fn select_representative(&mut self) {
if self.members.is_empty() {
return;
}
if self.members.len() == 1 {
self.representative = Some(self.members[0].clone());
return;
}
let n = self.members.len();
let mut total_score = vec![0.0f64; n];
let mut degree = vec![0usize; n];
for edge in &self.edges {
if edge.member_a < n && edge.member_b < n {
total_score[edge.member_a] += edge.score;
total_score[edge.member_b] += edge.score;
degree[edge.member_a] += 1;
degree[edge.member_b] += 1;
}
}
let centrality: Vec<f64> = total_score
.iter()
.zip(degree.iter())
.map(|(s, &d)| if d > 0 { *s / d as f64 } else { 0.0 })
.collect();
let best_idx = centrality
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.map(|(i, _)| i)
.unwrap_or(0);
self.representative = Some(self.members[best_idx].clone());
}
pub fn compute_confidence(&mut self) {
self.confidence = self.mean_similarity().clamp(0.0, 1.0);
}
pub fn assign_action(&mut self, certainty_threshold: f64, exact_threshold: f64) {
if !self.is_duplicate_group() {
self.action = ClusterAction::NoAction;
return;
}
if self.min_similarity() >= exact_threshold {
self.action = ClusterAction::DeleteAll;
} else if self.confidence >= certainty_threshold {
self.action = ClusterAction::KeepRepresentative;
} else {
self.action = ClusterAction::ManualReview;
}
}
}
#[derive(Debug, Clone)]
pub struct UnionFind {
parent: Vec<usize>,
rank: Vec<usize>,
pub component_count: usize,
}
impl UnionFind {
#[must_use]
pub fn new(n: usize) -> Self {
Self {
parent: (0..n).collect(),
rank: vec![0; n],
component_count: n,
}
}
pub fn find(&mut self, x: usize) -> usize {
let mut root = x;
while self.parent[root] != root {
root = self.parent[root];
}
let mut cur = x;
while cur != root {
let next = self.parent[cur];
self.parent[cur] = root;
cur = next;
}
root
}
pub fn union(&mut self, a: usize, b: usize) -> bool {
let ra = self.find(a);
let rb = self.find(b);
if ra == rb {
return false;
}
match self.rank[ra].cmp(&self.rank[rb]) {
std::cmp::Ordering::Less => self.parent[ra] = rb,
std::cmp::Ordering::Greater => self.parent[rb] = ra,
std::cmp::Ordering::Equal => {
self.parent[rb] = ra;
self.rank[ra] += 1;
}
}
self.component_count -= 1;
true
}
pub fn connected(&mut self, a: usize, b: usize) -> bool {
self.find(a) == self.find(b)
}
#[must_use]
pub fn len(&self) -> usize {
self.parent.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.parent.is_empty()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LinkageStrategy {
#[default]
SingleLinkage,
CompleteLinkage,
AverageLinkage,
}
#[derive(Debug, Clone)]
pub struct NearDuplicateClusterer {
pub similarity_threshold: f64,
pub linkage: LinkageStrategy,
pub certainty_threshold: f64,
pub exact_threshold: f64,
}
impl NearDuplicateClusterer {
#[must_use]
pub fn new(similarity_threshold: f64) -> Self {
Self {
similarity_threshold,
linkage: LinkageStrategy::SingleLinkage,
certainty_threshold: 0.85,
exact_threshold: 0.995,
}
}
#[must_use]
pub fn with_linkage(mut self, linkage: LinkageStrategy) -> Self {
self.linkage = linkage;
self
}
#[must_use]
pub fn with_certainty_threshold(mut self, t: f64) -> Self {
self.certainty_threshold = t;
self
}
#[must_use]
pub fn with_exact_threshold(mut self, t: f64) -> Self {
self.exact_threshold = t;
self
}
#[must_use]
pub fn cluster(&self, pairs: &[(String, String, f64)]) -> Vec<NearDuplicateCluster> {
if pairs.is_empty() {
return Vec::new();
}
let mut key_to_idx: HashMap<&str, usize> = HashMap::new();
let mut idx_to_key: Vec<&str> = Vec::new();
for (a, b, _) in pairs {
if !key_to_idx.contains_key(a.as_str()) {
let i = idx_to_key.len();
key_to_idx.insert(a.as_str(), i);
idx_to_key.push(a.as_str());
}
if !key_to_idx.contains_key(b.as_str()) {
let i = idx_to_key.len();
key_to_idx.insert(b.as_str(), i);
idx_to_key.push(b.as_str());
}
}
let n = idx_to_key.len();
let mut uf = UnionFind::new(n);
let valid_pairs: Vec<(usize, usize, f64)> = pairs
.iter()
.filter_map(|(a, b, s)| {
if *s >= self.similarity_threshold {
Some((key_to_idx[a.as_str()], key_to_idx[b.as_str()], *s))
} else {
None
}
})
.collect();
for &(a, b, _) in &valid_pairs {
uf.union(a, b);
}
let mut group_map: HashMap<usize, Vec<usize>> = HashMap::new();
for i in 0..n {
let root = uf.find(i);
group_map.entry(root).or_default().push(i);
}
let mut clusters: Vec<NearDuplicateCluster> = group_map
.into_values()
.filter(|members| members.len() >= 2)
.enumerate()
.map(|(cid, members)| {
let mut cluster = NearDuplicateCluster::new(cid);
let local_idx: HashMap<usize, usize> = members
.iter()
.enumerate()
.map(|(li, &gi)| (gi, li))
.collect();
for &gi in &members {
cluster.members.push(idx_to_key[gi].to_string());
}
for &(a, b, score) in &valid_pairs {
if let (Some(&la), Some(&lb)) = (local_idx.get(&a), local_idx.get(&b)) {
cluster.edges.push(SimilarityEdge::new(la, lb, score));
}
}
cluster.select_representative();
cluster.compute_confidence();
cluster.assign_action(self.certainty_threshold, self.exact_threshold);
cluster
})
.collect();
clusters.sort_by_key(|c| c.id);
clusters
}
}
#[derive(Debug, Clone)]
pub struct ClusterStats {
pub cluster_count: usize,
pub total_members: usize,
pub max_cluster_size: usize,
pub mean_cluster_size: f64,
pub mean_confidence: f64,
pub action_counts: HashMap<String, usize>,
pub total_edges: usize,
}
impl ClusterStats {
#[must_use]
pub fn from_clusters(clusters: &[NearDuplicateCluster]) -> Self {
let dup_clusters: Vec<&NearDuplicateCluster> =
clusters.iter().filter(|c| c.is_duplicate_group()).collect();
let cluster_count = dup_clusters.len();
let total_members: usize = dup_clusters.iter().map(|c| c.size()).sum();
let max_cluster_size = dup_clusters.iter().map(|c| c.size()).max().unwrap_or(0);
let mean_cluster_size = if cluster_count > 0 {
total_members as f64 / cluster_count as f64
} else {
0.0
};
let mean_confidence = if cluster_count > 0 {
dup_clusters.iter().map(|c| c.confidence).sum::<f64>() / cluster_count as f64
} else {
0.0
};
let mut action_counts: HashMap<String, usize> = HashMap::new();
for c in &dup_clusters {
*action_counts
.entry(c.action.label().to_string())
.or_insert(0) += 1;
}
let total_edges: usize = dup_clusters.iter().map(|c| c.edges.len()).sum();
Self {
cluster_count,
total_members,
max_cluster_size,
mean_cluster_size,
mean_confidence,
action_counts,
total_edges,
}
}
#[must_use]
pub fn summary(&self) -> String {
format!(
"{} clusters | {} members | max_size={} | mean_conf={:.3} | edges={}",
self.cluster_count,
self.total_members,
self.max_cluster_size,
self.mean_confidence,
self.total_edges,
)
}
}
#[cfg(test)]
mod tests {
use super::*;
fn s(v: &str) -> String {
v.to_string()
}
#[test]
fn test_union_find_basic() {
let mut uf = UnionFind::new(5);
assert_eq!(uf.component_count, 5);
assert!(uf.union(0, 1));
assert!(uf.union(1, 2));
assert!(!uf.union(0, 2)); assert_eq!(uf.component_count, 3);
assert!(uf.connected(0, 2));
assert!(!uf.connected(0, 3));
}
#[test]
fn test_union_find_path_compression() {
let mut uf = UnionFind::new(10);
for i in 0..9 {
uf.union(i, i + 1);
}
for i in 1..10 {
assert!(uf.connected(0, i));
}
assert_eq!(uf.component_count, 1);
}
#[test]
fn test_cluster_new_and_size() {
let c = NearDuplicateCluster::new(0);
assert_eq!(c.size(), 0);
assert!(!c.is_duplicate_group());
assert_eq!(c.action, ClusterAction::NoAction);
}
#[test]
fn test_cluster_mean_similarity_empty() {
let c = NearDuplicateCluster::new(0);
assert_eq!(c.mean_similarity(), 0.0);
assert_eq!(c.std_similarity(), 0.0);
}
#[test]
fn test_cluster_statistics() {
let mut c = NearDuplicateCluster::new(0);
c.members = vec![s("a"), s("b"), s("c")];
c.edges = vec![
SimilarityEdge::new(0, 1, 0.9),
SimilarityEdge::new(1, 2, 0.8),
SimilarityEdge::new(0, 2, 0.85),
];
let mean = c.mean_similarity();
assert!((mean - 0.85).abs() < 1e-9);
assert!(c.std_similarity() > 0.0);
assert!(c.min_similarity() < c.max_similarity());
}
#[test]
fn test_cluster_select_representative() {
let mut c = NearDuplicateCluster::new(0);
c.members = vec![s("a"), s("b"), s("c")];
c.edges = vec![
SimilarityEdge::new(0, 1, 0.9),
SimilarityEdge::new(1, 2, 0.95),
];
c.select_representative();
assert_eq!(c.representative, Some(s("c")));
}
#[test]
fn test_cluster_assign_action_no_action() {
let mut c = NearDuplicateCluster::new(0);
c.members = vec![s("only")];
c.assign_action(0.85, 0.995);
assert_eq!(c.action, ClusterAction::NoAction);
}
#[test]
fn test_cluster_assign_action_delete_all() {
let mut c = NearDuplicateCluster::new(0);
c.members = vec![s("a"), s("b")];
c.edges = vec![SimilarityEdge::new(0, 1, 0.999)];
c.compute_confidence();
c.assign_action(0.85, 0.995);
assert_eq!(c.action, ClusterAction::DeleteAll);
}
#[test]
fn test_cluster_assign_action_manual_review() {
let mut c = NearDuplicateCluster::new(0);
c.members = vec![s("a"), s("b")];
c.edges = vec![SimilarityEdge::new(0, 1, 0.70)];
c.compute_confidence();
c.assign_action(0.85, 0.995);
assert_eq!(c.action, ClusterAction::ManualReview);
}
#[test]
fn test_clusterer_transitive_chain() {
let pairs = vec![(s("a"), s("b"), 0.95), (s("b"), s("c"), 0.92)];
let clusterer = NearDuplicateClusterer::new(0.90);
let clusters = clusterer.cluster(&pairs);
assert_eq!(clusters.len(), 1);
assert_eq!(clusters[0].size(), 3);
}
#[test]
fn test_clusterer_separate_clusters() {
let pairs = vec![(s("a"), s("b"), 0.95), (s("x"), s("y"), 0.92)];
let clusterer = NearDuplicateClusterer::new(0.90);
let clusters = clusterer.cluster(&pairs);
assert_eq!(clusters.len(), 2);
}
#[test]
fn test_clusterer_threshold_filters() {
let pairs = vec![(s("a"), s("b"), 0.95), (s("c"), s("d"), 0.50)];
let clusterer = NearDuplicateClusterer::new(0.90);
let clusters = clusterer.cluster(&pairs);
assert_eq!(clusters.len(), 1);
assert_eq!(clusters[0].size(), 2);
}
#[test]
fn test_clusterer_empty_pairs() {
let clusterer = NearDuplicateClusterer::new(0.90);
let clusters: Vec<NearDuplicateCluster> = clusterer.cluster(&[]);
assert!(clusters.is_empty());
}
#[test]
fn test_clusterer_representative_assigned() {
let pairs = vec![(s("a"), s("b"), 0.96)];
let clusterer = NearDuplicateClusterer::new(0.90);
let clusters = clusterer.cluster(&pairs);
assert!(clusters[0].representative.is_some());
}
#[test]
fn test_cluster_stats_empty() {
let stats = ClusterStats::from_clusters(&[]);
assert_eq!(stats.cluster_count, 0);
assert_eq!(stats.total_members, 0);
assert_eq!(stats.mean_confidence, 0.0);
}
#[test]
fn test_cluster_stats_with_data() {
let pairs = vec![
(s("a"), s("b"), 0.96),
(s("c"), s("d"), 0.93),
(s("e"), s("f"), 0.91),
];
let clusterer = NearDuplicateClusterer::new(0.90);
let clusters = clusterer.cluster(&pairs);
let stats = ClusterStats::from_clusters(&clusters);
assert_eq!(stats.cluster_count, 3);
assert_eq!(stats.total_members, 6);
assert_eq!(stats.max_cluster_size, 2);
assert!(stats.mean_confidence > 0.0);
assert!(!stats.summary().is_empty());
}
#[test]
fn test_linkage_strategy_default() {
let strategy = LinkageStrategy::default();
assert_eq!(strategy, LinkageStrategy::SingleLinkage);
}
#[test]
fn test_cluster_action_labels() {
assert_eq!(
ClusterAction::KeepRepresentative.label(),
"keep_representative"
);
assert_eq!(ClusterAction::ManualReview.label(), "manual_review");
assert_eq!(ClusterAction::DeleteAll.label(), "delete_all");
assert_eq!(ClusterAction::NoAction.label(), "no_action");
}
}