Skip to main content

adler_core/
correlate.rs

1//! Cross-account correlation from enrichment fields.
2//!
3//! A username search returns many `Found` accounts for the *same handle* —
4//! but the same handle can belong to different people on different sites.
5//! This module groups accounts that share enough profile signal (name, bio)
6//! to plausibly be the same identity, so an analyst can tell
7//! "all these are clearly one person" from "this handle is just popular".
8//!
9//! Signals are text-only by design: avatar perceptual hashing needs image
10//! decoding (heavy deps) and is deferred. Each pair of accounts that both
11//! carry profile data is scored 0..1:
12//!
13//! - **name**: 1.0 if normalised-equal, else token Jaccard.
14//! - **bio**: token Jaccard.
15//! - combined = mean of the signals present in both.
16//!
17//! Pairs at or above [`LINK_THRESHOLD`] are linked; connected accounts form
18//! a cluster (union-find). Cluster confidence is the mean linking score.
19//! Confidence is a heuristic triage aid, not proof.
20
21// All `usize as f64` casts here are over small counts (token-set sizes,
22// cluster/edge counts) used to form ratios; the 52-bit mantissa is never a
23// concern at these magnitudes.
24#![allow(clippy::cast_precision_loss)]
25
26use std::collections::BTreeSet;
27
28use crate::check::{CheckOutcome, MatchKind};
29
30/// Minimum pairwise score to link two accounts.
31pub const LINK_THRESHOLD: f64 = 0.5;
32/// Drop tokens shorter than this when building word sets (cuts noise).
33const MIN_TOKEN_LEN: usize = 2;
34
35/// A group of accounts that likely belong to the same person.
36#[derive(Debug, Clone)]
37pub struct Cluster {
38    /// Site names of the member accounts.
39    pub members: Vec<String>,
40    /// Mean linking score across the cluster's edges, in `0..=1`.
41    pub confidence: f64,
42    /// A normalised name shared by the whole cluster, if any.
43    pub shared_name: Option<String>,
44}
45
46/// Result of correlating a scan's outcomes.
47#[derive(Debug, Clone, Default)]
48pub struct CorrelationReport {
49    /// Clusters of size ≥ 2 (actual cross-site links).
50    pub clusters: Vec<Cluster>,
51    /// Found accounts that carry profile data but linked to nothing.
52    pub unlinked: Vec<String>,
53    /// Found accounts with no profile data to correlate on.
54    pub without_profile: Vec<String>,
55}
56
57struct Node<'a> {
58    site: &'a str,
59    name: Option<String>,
60    name_tokens: BTreeSet<String>,
61    bio_tokens: BTreeSet<String>,
62}
63
64/// Correlate `Found` accounts by their enrichment fields.
65#[must_use]
66pub fn correlate(outcomes: &[CheckOutcome]) -> CorrelationReport {
67    let mut report = CorrelationReport::default();
68
69    let mut nodes: Vec<Node<'_>> = Vec::new();
70    for outcome in outcomes.iter().filter(|o| o.kind == MatchKind::Found) {
71        let name = outcome.enrichment.get("name");
72        let bio = outcome.enrichment.get("bio");
73        if name.is_none() && bio.is_none() {
74            report.without_profile.push(outcome.site.clone());
75            continue;
76        }
77        nodes.push(Node {
78            site: &outcome.site,
79            name: name.map(|n| normalize(n)),
80            name_tokens: name.map(|n| tokenize(n)).unwrap_or_default(),
81            bio_tokens: bio.map(|b| tokenize(b)).unwrap_or_default(),
82        });
83    }
84
85    let mut uf = UnionFind::new(nodes.len());
86    // Accumulate edge scores per root so we can average per cluster.
87    let mut edges: Vec<(usize, usize, f64)> = Vec::new();
88    for a in 0..nodes.len() {
89        for b in (a + 1)..nodes.len() {
90            let score = pair_score(&nodes[a], &nodes[b]);
91            if score >= LINK_THRESHOLD {
92                uf.union(a, b);
93                edges.push((a, b, score));
94            }
95        }
96    }
97
98    // Group node indices by union-find root.
99    let mut by_root: std::collections::HashMap<usize, Vec<usize>> =
100        std::collections::HashMap::new();
101    for i in 0..nodes.len() {
102        by_root.entry(uf.find(i)).or_default().push(i);
103    }
104
105    for (root, members) in by_root {
106        if members.len() < 2 {
107            // Singleton with profile data → unlinked.
108            for &i in &members {
109                report.unlinked.push(nodes[i].site.to_owned());
110            }
111            continue;
112        }
113        let scores: Vec<f64> = edges
114            .iter()
115            .filter(|(a, _, _)| uf_root_eq(&mut uf, *a, root))
116            .map(|(_, _, s)| *s)
117            .collect();
118        let confidence = if scores.is_empty() {
119            0.0
120        } else {
121            scores.iter().sum::<f64>() / scores.len() as f64
122        };
123        let mut member_sites: Vec<String> =
124            members.iter().map(|&i| nodes[i].site.to_owned()).collect();
125        member_sites.sort_unstable();
126        report.clusters.push(Cluster {
127            members: member_sites,
128            confidence,
129            shared_name: shared_name(&members, &nodes),
130        });
131    }
132
133    report.clusters.sort_by(|a, b| {
134        b.confidence
135            .partial_cmp(&a.confidence)
136            .unwrap_or(std::cmp::Ordering::Equal)
137            .then_with(|| a.members.cmp(&b.members))
138    });
139    report.unlinked.sort_unstable();
140    report.without_profile.sort_unstable();
141    report
142}
143
144fn uf_root_eq(uf: &mut UnionFind, node: usize, root: usize) -> bool {
145    uf.find(node) == root
146}
147
148fn shared_name(members: &[usize], nodes: &[Node<'_>]) -> Option<String> {
149    let first = nodes[members[0]].name.clone()?;
150    if first.is_empty() {
151        return None;
152    }
153    members
154        .iter()
155        .all(|&i| nodes[i].name.as_deref() == Some(first.as_str()))
156        .then_some(first)
157}
158
159fn pair_score(a: &Node<'_>, b: &Node<'_>) -> f64 {
160    let mut signals: Vec<f64> = Vec::new();
161    if a.name.is_some() && b.name.is_some() {
162        let name_sim = if a.name == b.name {
163            1.0
164        } else {
165            jaccard(&a.name_tokens, &b.name_tokens)
166        };
167        signals.push(name_sim);
168    }
169    if !a.bio_tokens.is_empty() && !b.bio_tokens.is_empty() {
170        signals.push(jaccard(&a.bio_tokens, &b.bio_tokens));
171    }
172    if signals.is_empty() {
173        0.0
174    } else {
175        signals.iter().sum::<f64>() / signals.len() as f64
176    }
177}
178
179fn normalize(s: &str) -> String {
180    s.split_whitespace()
181        .collect::<Vec<_>>()
182        .join(" ")
183        .to_lowercase()
184}
185
186fn tokenize(s: &str) -> BTreeSet<String> {
187    s.split(|c: char| !c.is_alphanumeric())
188        .filter(|t| t.chars().count() >= MIN_TOKEN_LEN)
189        .map(str::to_lowercase)
190        .collect()
191}
192
193fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f64 {
194    if a.is_empty() && b.is_empty() {
195        return 0.0;
196    }
197    let inter = a.intersection(b).count();
198    let union = a.union(b).count();
199    if union == 0 {
200        0.0
201    } else {
202        inter as f64 / union as f64
203    }
204}
205
206struct UnionFind {
207    parent: Vec<usize>,
208}
209
210impl UnionFind {
211    fn new(n: usize) -> Self {
212        Self {
213            parent: (0..n).collect(),
214        }
215    }
216
217    fn find(&mut self, x: usize) -> usize {
218        let mut root = x;
219        while self.parent[root] != root {
220            root = self.parent[root];
221        }
222        // Path compression.
223        let mut cur = x;
224        while self.parent[cur] != root {
225            let next = self.parent[cur];
226            self.parent[cur] = root;
227            cur = next;
228        }
229        root
230    }
231
232    fn union(&mut self, a: usize, b: usize) {
233        let (ra, rb) = (self.find(a), self.find(b));
234        if ra != rb {
235            self.parent[ra] = rb;
236        }
237    }
238}
239
240#[cfg(test)]
241mod tests {
242    use super::*;
243    use std::collections::BTreeMap;
244
245    fn found(site: &str, fields: &[(&str, &str)]) -> CheckOutcome {
246        let mut enrichment = BTreeMap::new();
247        for (k, v) in fields {
248            enrichment.insert((*k).to_owned(), (*v).to_owned());
249        }
250        CheckOutcome {
251            site: site.into(),
252            url: format!("https://{site}.example/u"),
253            kind: MatchKind::Found,
254            reason: None,
255            elapsed_ms: 1,
256            enrichment,
257            evidence: Vec::new(),
258        }
259    }
260
261    #[test]
262    fn links_accounts_with_matching_name_and_bio() {
263        let outcomes = vec![
264            found(
265                "GitHub",
266                &[("name", "Alice Liddell"), ("bio", "Rust systems hacker")],
267            ),
268            found(
269                "GitLab",
270                &[("name", "Alice Liddell"), ("bio", "systems hacker, Rust")],
271            ),
272        ];
273        let report = correlate(&outcomes);
274        assert_eq!(report.clusters.len(), 1);
275        let c = &report.clusters[0];
276        assert_eq!(c.members, ["GitHub", "GitLab"]);
277        assert!(c.confidence > 0.7, "confidence {}", c.confidence);
278        assert_eq!(c.shared_name.as_deref(), Some("alice liddell"));
279    }
280
281    #[test]
282    fn does_not_link_different_people_sharing_a_handle() {
283        let outcomes = vec![
284            found(
285                "GitHub",
286                &[("name", "Alice Liddell"), ("bio", "Rust systems hacker")],
287            ),
288            found(
289                "Twitch",
290                &[("name", "Bob Jones"), ("bio", "pro gamer and streamer")],
291            ),
292        ];
293        let report = correlate(&outcomes);
294        assert!(report.clusters.is_empty(), "should not link: {report:?}");
295        assert_eq!(report.unlinked.len(), 2);
296    }
297
298    #[test]
299    fn accounts_without_profile_are_separated() {
300        let outcomes = vec![
301            found("GitHub", &[("name", "Alice Liddell")]),
302            found("Vimeo", &[]),
303            found("HackerNews", &[]),
304        ];
305        let report = correlate(&outcomes);
306        assert!(report.clusters.is_empty());
307        assert_eq!(report.unlinked, ["GitHub"]);
308        assert_eq!(report.without_profile, ["HackerNews", "Vimeo"]);
309    }
310
311    #[test]
312    fn transitive_links_form_one_cluster() {
313        let outcomes = vec![
314            found("A", &[("bio", "loves rust and coffee")]),
315            found("B", &[("bio", "rust and coffee enthusiast")]),
316            found("C", &[("bio", "coffee and rust forever")]),
317        ];
318        let report = correlate(&outcomes);
319        assert_eq!(report.clusters.len(), 1);
320        assert_eq!(report.clusters[0].members.len(), 3);
321    }
322
323    #[test]
324    fn ignores_not_found_outcomes() {
325        let mut nf = found("GitLab", &[("name", "Alice Liddell")]);
326        nf.kind = MatchKind::NotFound;
327        let outcomes = vec![found("GitHub", &[("name", "Alice Liddell")]), nf];
328        let report = correlate(&outcomes);
329        // Only one Found-with-profile node → no cluster, one unlinked.
330        assert!(report.clusters.is_empty());
331        assert_eq!(report.unlinked, ["GitHub"]);
332    }
333
334    #[test]
335    fn jaccard_basics() {
336        let a = tokenize("rust and coffee");
337        let b = tokenize("rust and tea");
338        // tokens (len>=2): {rust, and, coffee} vs {rust, and, tea}
339        // inter {rust, and}=2, union 4 → 0.5
340        assert!((jaccard(&a, &b) - 0.5).abs() < 1e-9);
341    }
342}