1#![allow(clippy::cast_precision_loss)]
25
26use std::collections::BTreeSet;
27
28use crate::check::{CheckOutcome, MatchKind};
29
30pub const LINK_THRESHOLD: f64 = 0.5;
32const MIN_TOKEN_LEN: usize = 2;
34
35#[derive(Debug, Clone)]
37pub struct Cluster {
38 pub members: Vec<String>,
40 pub confidence: f64,
42 pub shared_name: Option<String>,
44}
45
46#[derive(Debug, Clone, Default)]
48pub struct CorrelationReport {
49 pub clusters: Vec<Cluster>,
51 pub unlinked: Vec<String>,
53 pub without_profile: Vec<String>,
55}
56
57struct Node<'a> {
58 site: &'a str,
59 name: Option<String>,
60 name_tokens: BTreeSet<String>,
61 bio_tokens: BTreeSet<String>,
62}
63
64#[must_use]
66pub fn correlate(outcomes: &[CheckOutcome]) -> CorrelationReport {
67 let mut report = CorrelationReport::default();
68
69 let mut nodes: Vec<Node<'_>> = Vec::new();
70 for outcome in outcomes.iter().filter(|o| o.kind == MatchKind::Found) {
71 let name = outcome.enrichment.get("name");
72 let bio = outcome.enrichment.get("bio");
73 if name.is_none() && bio.is_none() {
74 report.without_profile.push(outcome.site.clone());
75 continue;
76 }
77 nodes.push(Node {
78 site: &outcome.site,
79 name: name.map(|n| normalize(n)),
80 name_tokens: name.map(|n| tokenize(n)).unwrap_or_default(),
81 bio_tokens: bio.map(|b| tokenize(b)).unwrap_or_default(),
82 });
83 }
84
85 let mut uf = UnionFind::new(nodes.len());
86 let mut edges: Vec<(usize, usize, f64)> = Vec::new();
88 for a in 0..nodes.len() {
89 for b in (a + 1)..nodes.len() {
90 let score = pair_score(&nodes[a], &nodes[b]);
91 if score >= LINK_THRESHOLD {
92 uf.union(a, b);
93 edges.push((a, b, score));
94 }
95 }
96 }
97
98 let mut by_root: std::collections::HashMap<usize, Vec<usize>> =
100 std::collections::HashMap::new();
101 for i in 0..nodes.len() {
102 by_root.entry(uf.find(i)).or_default().push(i);
103 }
104
105 for (root, members) in by_root {
106 if members.len() < 2 {
107 for &i in &members {
109 report.unlinked.push(nodes[i].site.to_owned());
110 }
111 continue;
112 }
113 let scores: Vec<f64> = edges
114 .iter()
115 .filter(|(a, _, _)| uf_root_eq(&mut uf, *a, root))
116 .map(|(_, _, s)| *s)
117 .collect();
118 let confidence = if scores.is_empty() {
119 0.0
120 } else {
121 scores.iter().sum::<f64>() / scores.len() as f64
122 };
123 let mut member_sites: Vec<String> =
124 members.iter().map(|&i| nodes[i].site.to_owned()).collect();
125 member_sites.sort_unstable();
126 report.clusters.push(Cluster {
127 members: member_sites,
128 confidence,
129 shared_name: shared_name(&members, &nodes),
130 });
131 }
132
133 report.clusters.sort_by(|a, b| {
134 b.confidence
135 .partial_cmp(&a.confidence)
136 .unwrap_or(std::cmp::Ordering::Equal)
137 .then_with(|| a.members.cmp(&b.members))
138 });
139 report.unlinked.sort_unstable();
140 report.without_profile.sort_unstable();
141 report
142}
143
144fn uf_root_eq(uf: &mut UnionFind, node: usize, root: usize) -> bool {
145 uf.find(node) == root
146}
147
148fn shared_name(members: &[usize], nodes: &[Node<'_>]) -> Option<String> {
149 let first = nodes[members[0]].name.clone()?;
150 if first.is_empty() {
151 return None;
152 }
153 members
154 .iter()
155 .all(|&i| nodes[i].name.as_deref() == Some(first.as_str()))
156 .then_some(first)
157}
158
159fn pair_score(a: &Node<'_>, b: &Node<'_>) -> f64 {
160 let mut signals: Vec<f64> = Vec::new();
161 if a.name.is_some() && b.name.is_some() {
162 let name_sim = if a.name == b.name {
163 1.0
164 } else {
165 jaccard(&a.name_tokens, &b.name_tokens)
166 };
167 signals.push(name_sim);
168 }
169 if !a.bio_tokens.is_empty() && !b.bio_tokens.is_empty() {
170 signals.push(jaccard(&a.bio_tokens, &b.bio_tokens));
171 }
172 if signals.is_empty() {
173 0.0
174 } else {
175 signals.iter().sum::<f64>() / signals.len() as f64
176 }
177}
178
179fn normalize(s: &str) -> String {
180 s.split_whitespace()
181 .collect::<Vec<_>>()
182 .join(" ")
183 .to_lowercase()
184}
185
186fn tokenize(s: &str) -> BTreeSet<String> {
187 s.split(|c: char| !c.is_alphanumeric())
188 .filter(|t| t.chars().count() >= MIN_TOKEN_LEN)
189 .map(str::to_lowercase)
190 .collect()
191}
192
193fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
194 if a.is_empty() && b.is_empty() {
195 return 0.0;
196 }
197 let inter = a.intersection(b).count();
198 let union = a.union(b).count();
199 if union == 0 {
200 0.0
201 } else {
202 inter as f64 / union as f64
203 }
204}
205
206struct UnionFind {
207 parent: Vec<usize>,
208}
209
210impl UnionFind {
211 fn new(n: usize) -> Self {
212 Self {
213 parent: (0..n).collect(),
214 }
215 }
216
217 fn find(&mut self, x: usize) -> usize {
218 let mut root = x;
219 while self.parent[root] != root {
220 root = self.parent[root];
221 }
222 let mut cur = x;
224 while self.parent[cur] != root {
225 let next = self.parent[cur];
226 self.parent[cur] = root;
227 cur = next;
228 }
229 root
230 }
231
232 fn union(&mut self, a: usize, b: usize) {
233 let (ra, rb) = (self.find(a), self.find(b));
234 if ra != rb {
235 self.parent[ra] = rb;
236 }
237 }
238}
239
240#[cfg(test)]
241mod tests {
242 use super::*;
243 use std::collections::BTreeMap;
244
245 fn found(site: &str, fields: &[(&str, &str)]) -> CheckOutcome {
246 let mut enrichment = BTreeMap::new();
247 for (k, v) in fields {
248 enrichment.insert((*k).to_owned(), (*v).to_owned());
249 }
250 CheckOutcome {
251 site: site.into(),
252 url: format!("https://{site}.example/u"),
253 kind: MatchKind::Found,
254 reason: None,
255 elapsed_ms: 1,
256 enrichment,
257 evidence: Vec::new(),
258 }
259 }
260
261 #[test]
262 fn links_accounts_with_matching_name_and_bio() {
263 let outcomes = vec![
264 found(
265 "GitHub",
266 &[("name", "Alice Liddell"), ("bio", "Rust systems hacker")],
267 ),
268 found(
269 "GitLab",
270 &[("name", "Alice Liddell"), ("bio", "systems hacker, Rust")],
271 ),
272 ];
273 let report = correlate(&outcomes);
274 assert_eq!(report.clusters.len(), 1);
275 let c = &report.clusters[0];
276 assert_eq!(c.members, ["GitHub", "GitLab"]);
277 assert!(c.confidence > 0.7, "confidence {}", c.confidence);
278 assert_eq!(c.shared_name.as_deref(), Some("alice liddell"));
279 }
280
281 #[test]
282 fn does_not_link_different_people_sharing_a_handle() {
283 let outcomes = vec![
284 found(
285 "GitHub",
286 &[("name", "Alice Liddell"), ("bio", "Rust systems hacker")],
287 ),
288 found(
289 "Twitch",
290 &[("name", "Bob Jones"), ("bio", "pro gamer and streamer")],
291 ),
292 ];
293 let report = correlate(&outcomes);
294 assert!(report.clusters.is_empty(), "should not link: {report:?}");
295 assert_eq!(report.unlinked.len(), 2);
296 }
297
298 #[test]
299 fn accounts_without_profile_are_separated() {
300 let outcomes = vec![
301 found("GitHub", &[("name", "Alice Liddell")]),
302 found("Vimeo", &[]),
303 found("HackerNews", &[]),
304 ];
305 let report = correlate(&outcomes);
306 assert!(report.clusters.is_empty());
307 assert_eq!(report.unlinked, ["GitHub"]);
308 assert_eq!(report.without_profile, ["HackerNews", "Vimeo"]);
309 }
310
311 #[test]
312 fn transitive_links_form_one_cluster() {
313 let outcomes = vec![
314 found("A", &[("bio", "loves rust and coffee")]),
315 found("B", &[("bio", "rust and coffee enthusiast")]),
316 found("C", &[("bio", "coffee and rust forever")]),
317 ];
318 let report = correlate(&outcomes);
319 assert_eq!(report.clusters.len(), 1);
320 assert_eq!(report.clusters[0].members.len(), 3);
321 }
322
323 #[test]
324 fn ignores_not_found_outcomes() {
325 let mut nf = found("GitLab", &[("name", "Alice Liddell")]);
326 nf.kind = MatchKind::NotFound;
327 let outcomes = vec![found("GitHub", &[("name", "Alice Liddell")]), nf];
328 let report = correlate(&outcomes);
329 assert!(report.clusters.is_empty());
331 assert_eq!(report.unlinked, ["GitHub"]);
332 }
333
334 #[test]
335 fn jaccard_basics() {
336 let a = tokenize("rust and coffee");
337 let b = tokenize("rust and tea");
338 assert!((jaccard(&a, &b) - 0.5).abs() < 1e-9);
341 }
342}