use std::collections::HashMap;
use rusqlite::params;
use strsim::jaro_winkler;
use tracing::debug;
use crate::core::config::TeamConfig;
use crate::core::db::Database;
/// Jaro-Winkler score a candidate must reach in the first (un-normalized)
/// fuzzy pass of [`IdentityResolver::resolve`]. Constructors use it as the
/// initial value of the resolver's `threshold` field.
pub const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.85;
/// Jaro-Winkler score a candidate must reach in the second fuzzy pass of
/// [`IdentityResolver::resolve`], which compares strings after they have been
/// run through `normalize_for_fuzzy`.
pub const NORMALIZED_SIMILARITY_THRESHOLD: f64 = 0.82;
/// Canonicalizes a name or email local part for fuzzy comparison.
///
/// The input is lower-cased, the separators `.`, `-` and `_` are treated like
/// whitespace, and the remaining words are joined with single spaces. Leading
/// and trailing separators therefore disappear and an all-separator input
/// yields an empty string.
fn normalize_for_fuzzy(s: &str) -> String {
    let lowered = s.to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    let words = lowered
        .split(|c: char| c.is_whitespace() || matches!(c, '.' | '-' | '_'))
        .filter(|word| !word.is_empty());
    for word in words {
        // Words are never empty, so an empty buffer means "first word".
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
/// Returns the lower-cased text before the first `@` of `email`.
///
/// When `email` contains no `@` the whole string is lower-cased and returned.
fn email_local_part(email: &str) -> String {
    let local = email
        .split_once('@')
        .map_or(email, |(before_at, _domain)| before_at);
    local.to_lowercase()
}
/// Maps raw author `(name, email)` pairs onto canonical team identities.
///
/// `Debug` and `Clone` are derived so a resolver can be logged and copied
/// like any other plain-data value.
#[derive(Debug, Clone)]
pub struct IdentityResolver {
    /// Lower-cased alias (a name or an email) -> canonical member name.
    aliases: HashMap<String, String>,
    /// Known members as `(canonical_name, canonical_email)`; the email may be
    /// an empty string when none is known.
    members: Vec<(String, String)>,
    /// Minimum Jaro-Winkler score accepted by the un-normalized fuzzy pass.
    threshold: f64,
}
impl IdentityResolver {
/// Builds a resolver from an optional team configuration.
///
/// Every entry of `team.aliases` is registered as-is (key lower-cased). Each
/// member contributes one `(name, email)` record, and the member's aliases
/// and email are registered as aliases of the member's name. With `None` the
/// resolver is empty and `resolve` passes every input through unchanged.
///
/// Blank alias keys (empty or whitespace-only, e.g. a member without an
/// email) are not registered: a `""` key would otherwise capture every
/// author whose name or email is blank.
pub fn new(team: Option<&TeamConfig>) -> Self {
    let mut aliases: HashMap<String, String> = HashMap::new();
    let mut members: Vec<(String, String)> = Vec::new();
    if let Some(team) = team {
        for (alias, canonical) in &team.aliases {
            if !alias.trim().is_empty() {
                aliases.insert(alias.to_lowercase(), canonical.clone());
            }
        }
        for member in &team.members {
            members.push((member.name.clone(), member.email.clone()));
            for alias in &member.aliases {
                if !alias.trim().is_empty() {
                    aliases.insert(alias.to_lowercase(), member.name.clone());
                }
            }
            // Registered last so the member's own email always maps to the member.
            if !member.email.trim().is_empty() {
                aliases.insert(member.email.to_lowercase(), member.name.clone());
            }
        }
    }
    Self {
        aliases,
        members,
        threshold: DEFAULT_SIMILARITY_THRESHOLD,
    }
}
/// Builds a resolver from a `canonical name -> aliases` map.
///
/// For each entry the first alias containing `@` becomes the canonical email
/// (empty string when there is none). The canonical name itself and every
/// alias are registered, lower-cased, as aliases of the canonical name.
///
/// Entries are processed in ascending order of canonical name rather than in
/// `HashMap` iteration order, so that member order (which decides fuzzy-match
/// ties in `resolve`) and the winner of an alias claimed by two entries are
/// the same on every run. Blank canonical names and blank aliases are
/// skipped, matching what `add_alias` accepts.
pub fn from_alias_map(map: &HashMap<String, Vec<String>>) -> Self {
    let mut entries: Vec<(&String, &Vec<String>)> = map.iter().collect();
    entries.sort_unstable_by(|a, b| a.0.cmp(b.0));

    let mut aliases: HashMap<String, String> = HashMap::new();
    let mut members: Vec<(String, String)> = Vec::with_capacity(entries.len());
    for (canon_name, alias_list) in entries {
        if canon_name.trim().is_empty() {
            continue;
        }
        let canon_email = alias_list
            .iter()
            .find(|alias| alias.contains('@'))
            .cloned()
            .unwrap_or_default();
        members.push((canon_name.clone(), canon_email));
        aliases.insert(canon_name.to_lowercase(), canon_name.clone());
        // The canonical email is itself an element of `alias_list`, so this
        // loop registers it too.
        for alias in alias_list {
            if !alias.trim().is_empty() {
                aliases.insert(alias.to_lowercase(), canon_name.clone());
            }
        }
    }
    Self {
        aliases,
        members,
        threshold: DEFAULT_SIMILARITY_THRESHOLD,
    }
}
/// Builds a resolver from a loaded configuration.
///
/// The map returned by `config.resolved_aliases()` is used when it has at
/// least one entry; otherwise the resolver is built from `config.team`.
pub fn from_config(config: &crate::core::config::Config) -> Self {
    let alias_map = config.resolved_aliases();
    if alias_map.is_empty() {
        return Self::new(config.team.as_ref());
    }
    Self::from_alias_map(&alias_map)
}
pub fn with_threshold(mut self, threshold: f64) -> Self {
self.threshold = threshold;
self
}
/// Registers `alias` as another spelling of `canonical_name`.
///
/// Both arguments are trimmed; the call is a no-op when either is then
/// empty. If no member with that name exists yet (ASCII case-insensitive
/// comparison) one is appended, using `alias` as its email when the alias
/// contains `@` and an empty email otherwise.
pub fn add_alias(&mut self, alias: &str, canonical_name: &str) {
    let (alias, canonical) = (alias.trim(), canonical_name.trim());
    if alias.is_empty() || canonical.is_empty() {
        return;
    }

    self.aliases
        .insert(alias.to_lowercase(), canonical.to_string());

    let already_member = self
        .members
        .iter()
        .any(|(member_name, _)| member_name.eq_ignore_ascii_case(canonical));
    if already_member {
        return;
    }

    let mut canonical_email = String::new();
    if alias.contains('@') {
        canonical_email.push_str(alias);
    }
    self.members.push((canonical.to_string(), canonical_email));
}
/// Maps a raw author `(name, email)` to a canonical `(name, email)`.
///
/// Stages run in order and the first hit wins:
///
/// 1. Exact alias lookup on the lower-cased email, then the lower-cased name.
/// 2. Jaro-Winkler between the lower-cased name / email local part and each
///    member's name / email local part; best score `>= self.threshold` wins.
/// 3. The same idea on `normalize_for_fuzzy` forms, comparing every
///    combination of name and local part; best score
///    `>= NORMALIZED_SIMILARITY_THRESHOLD` wins.
///
/// When nothing matches, the inputs are returned unchanged. Ties keep the
/// earliest member. The returned email is the member's canonical email and
/// may be empty when the member has none.
///
/// Two guards keep unrelated authors from being merged:
///
/// * Blank strings never take part in a lookup or comparison. `strsim`
///   scores two empty strings as identical, so a blank input email would
///   otherwise match every member without a canonical email.
/// * Stage 2 compares email *local parts*, not whole addresses. A shared
///   domain is most of an address, so e.g. `kit@example.com` and
///   `bob@example.com` score above 0.85 on the domain alone.
pub fn resolve(&self, name: &str, email: &str) -> (String, String) {
    /// Jaro-Winkler similarity that treats a blank operand as "no evidence".
    fn similarity(a: &str, b: &str) -> f64 {
        if a.is_empty() || b.is_empty() {
            0.0
        } else {
            jaro_winkler(a, b)
        }
    }

    let email_lc = email.to_lowercase();
    let name_lc = name.to_lowercase();

    // Stage 1: exact alias lookup. An alias that points at a name with no
    // member record is ignored and resolution continues.
    let exact = |key: &str| -> Option<(String, String)> {
        if key.trim().is_empty() {
            return None;
        }
        self.aliases
            .get(key)
            .and_then(|canonical| self.find_member_by_name(canonical))
    };
    if let Some(hit) = exact(&email_lc).or_else(|| exact(&name_lc)) {
        return hit;
    }

    // Stage 2: fuzzy match on the lower-cased name and email local part.
    let local_lc = email_local_part(email);
    let mut best: Option<(f64, &(String, String))> = None;
    for member in &self.members {
        let name_score = similarity(&name_lc, &member.0.to_lowercase());
        let email_score = similarity(&local_lc, &email_local_part(&member.1));
        let score = name_score.max(email_score);
        // Strict `>` keeps the earliest member on ties.
        if score >= self.threshold && best.map_or(true, |(top, _)| score > top) {
            best = Some((score, member));
        }
    }
    if let Some((score, m)) = best {
        debug!(score, member = %m.0, "fuzzy identity match");
        return (m.0.clone(), m.1.clone());
    }

    // Stage 3: fuzzy match on separator-insensitive forms, cross-comparing
    // names with local parts (e.g. "bob.matsuoka@..." against "Bob Matsuoka").
    let name_norm = normalize_for_fuzzy(name);
    let local_norm = normalize_for_fuzzy(&email_local_part(email));
    let mut best_norm: Option<(f64, &(String, String))> = None;
    for member in &self.members {
        let canon_name_norm = normalize_for_fuzzy(&member.0);
        let canon_local_norm = normalize_for_fuzzy(&email_local_part(&member.1));
        let candidates = [
            similarity(&local_norm, &canon_name_norm),
            similarity(&local_norm, &canon_local_norm),
            similarity(&name_norm, &canon_name_norm),
            similarity(&name_norm, &canon_local_norm),
        ];
        let score = candidates.iter().copied().fold(0.0_f64, f64::max);
        if score >= NORMALIZED_SIMILARITY_THRESHOLD
            && best_norm.map_or(true, |(top, _)| score > top)
        {
            best_norm = Some((score, member));
        }
    }
    if let Some((score, m)) = best_norm {
        debug!(score, member = %m.0, "normalized fuzzy identity match");
        return (m.0.clone(), m.1.clone());
    }

    (name.to_string(), email.to_string())
}
/// Resolves `(name, email)` and inserts or updates the matching `authors` row.
///
/// The row is keyed on `canonical_email`; on conflict only `canonical_name`
/// is overwritten. Returns the row's `id`.
///
/// When the resolved member has no canonical email, the raw `email` is used
/// as the key. Without this fallback every such member is written under
/// `canonical_email = ''`, so they overwrite each other's name and share one
/// author id.
///
/// # Errors
///
/// Propagates any error from the insert or from the follow-up id lookup.
pub fn upsert_author(
    &self,
    db: &Database,
    name: &str,
    email: &str,
) -> crate::core::Result<i64> {
    let (canon_name, resolved_email) = self.resolve(name, email);
    let canon_email = if resolved_email.trim().is_empty() {
        email.to_string()
    } else {
        resolved_email
    };
    let conn = db.connection();
    conn.execute(
        "INSERT INTO authors (canonical_name, canonical_email, aliases) \
         VALUES (?1, ?2, '[]') \
         ON CONFLICT(canonical_email) DO UPDATE SET canonical_name = excluded.canonical_name",
        params![canon_name, canon_email],
    )?;
    // Looked up separately because the insert may have been an update of an
    // existing row.
    let id: i64 = conn.query_row(
        "SELECT id FROM authors WHERE canonical_email = ?1",
        params![canon_email],
        |row| row.get(0),
    )?;
    Ok(id)
}
/// Returns a copy of the first member whose name equals `name`, ignoring
/// ASCII case, or `None` when there is no such member.
fn find_member_by_name(&self, name: &str) -> Option<(String, String)> {
    for member in &self.members {
        if member.0.eq_ignore_ascii_case(name) {
            return Some(member.clone());
        }
    }
    None
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::{TeamConfig, TeamMember};
    use std::collections::HashMap;

    /// One-member team: "Bob Smith" with a secondary email alias and the
    /// team-level nickname "bobby".
    fn make_team() -> TeamConfig {
        let mut aliases = HashMap::new();
        aliases.insert("bobby".into(), "Bob Smith".into());
        TeamConfig {
            members: vec![TeamMember {
                name: "Bob Smith".into(),
                email: "bob@example.com".into(),
                aliases: vec!["bsmith@example.com".into()],
            }],
            aliases,
        }
    }

    /// A member-level email alias resolves to the member's canonical pair.
    #[test]
    fn exact_email_alias_match() {
        let r = IdentityResolver::new(Some(&make_team()));
        let (n, e) = r.resolve("Whoever", "bsmith@example.com");
        assert_eq!(n, "Bob Smith");
        assert_eq!(e, "bob@example.com");
    }

    /// A team-level name alias resolves even when the email is unknown.
    #[test]
    fn exact_name_alias_match() {
        let r = IdentityResolver::new(Some(&make_team()));
        let (n, e) = r.resolve("bobby", "x@y.com");
        assert_eq!(n, "Bob Smith");
        assert_eq!(e, "bob@example.com");
    }

    /// A near-miss spelling of the canonical name is matched fuzzily.
    #[test]
    fn fuzzy_match_canonical_name() {
        let r = IdentityResolver::new(Some(&make_team()));
        let (n, _e) = r.resolve("Bob Smyth", "unknown@elsewhere.com");
        assert_eq!(n, "Bob Smith");
    }

    /// An unrelated author is returned unchanged.
    #[test]
    fn no_match_returns_input() {
        let r = IdentityResolver::new(Some(&make_team()));
        let (n, e) = r.resolve("Zelda Q", "zelda@nowhere.test");
        assert_eq!(n, "Zelda Q");
        assert_eq!(e, "zelda@nowhere.test");
    }

    /// Without a team configuration every input passes through.
    #[test]
    fn empty_team_passthrough() {
        let r = IdentityResolver::new(None);
        let (n, e) = r.resolve("Anyone", "anyone@x.com");
        assert_eq!(n, "Anyone");
        assert_eq!(e, "anyone@x.com");
    }

    /// Every alias in an alias map resolves; the first email alias is the
    /// canonical email.
    #[test]
    fn all_aliases_registered() {
        let mut map: HashMap<String, Vec<String>> = HashMap::new();
        map.insert(
            "Alice Smith".to_string(),
            vec![
                "alice@company.com".into(),
                "alice.smith@personal.com".into(),
                "asmith".into(),
            ],
        );
        let r = IdentityResolver::from_alias_map(&map);
        let (n, e) = r.resolve("whoever", "alice@company.com");
        assert_eq!(n, "Alice Smith");
        assert_eq!(e, "alice@company.com");
        let (n, e) = r.resolve("whoever", "alice.smith@personal.com");
        assert_eq!(n, "Alice Smith");
        assert_eq!(e, "alice@company.com");
        let (n, e) = r.resolve("asmith", "noise@nowhere.test");
        assert_eq!(n, "Alice Smith");
        assert_eq!(e, "alice@company.com");
    }

    /// The same local part on a different domain resolves to the member.
    #[test]
    fn email_local_part_fuzzy_match() {
        let mut map: HashMap<String, Vec<String>> = HashMap::new();
        map.insert(
            "Bob Matsuoka".to_string(),
            vec!["bob.matsuoka@duettoresearch.com".into()],
        );
        let r = IdentityResolver::from_alias_map(&map);
        let (n, e) = r.resolve("Bob M", "bob.matsuoka@otherdomain.com");
        assert_eq!(n, "Bob Matsuoka");
        assert_eq!(e, "bob.matsuoka@duettoresearch.com");
    }

    /// Email alias lookup ignores letter case.
    #[test]
    fn case_insensitive_email_lookup() {
        let mut map: HashMap<String, Vec<String>> = HashMap::new();
        map.insert("Alice Smith".to_string(), vec!["alice@company.com".into()]);
        let r = IdentityResolver::from_alias_map(&map);
        let (n, e) = r.resolve("Whoever", "ALICE@COMPANY.COM");
        assert_eq!(n, "Alice Smith");
        assert_eq!(e, "alice@company.com");
        let (n2, e2) = r.resolve("WhoEver", "Alice@Company.Com");
        assert_eq!(n2, "Alice Smith");
        assert_eq!(e2, "alice@company.com");
    }

    /// An abbreviated name still resolves to the full canonical name.
    #[test]
    fn short_name_fuzzy() {
        let mut map: HashMap<String, Vec<String>> = HashMap::new();
        map.insert(
            "Bob Matsuoka".to_string(),
            vec!["bob.matsuoka@co.com".into()],
        );
        let r = IdentityResolver::from_alias_map(&map);
        let (n, _e) = r.resolve("Bob M", "bobm@unknown.test");
        assert_eq!(n, "Bob Matsuoka");
    }

    /// An author unrelated to the alias map is returned unchanged.
    #[test]
    fn unknown_author_passthrough() {
        let mut map: HashMap<String, Vec<String>> = HashMap::new();
        map.insert("Alice Smith".to_string(), vec!["alice@company.com".into()]);
        let r = IdentityResolver::from_alias_map(&map);
        let (n, e) = r.resolve("Zelda Q", "zelda@nowhere.test");
        assert_eq!(n, "Zelda Q");
        assert_eq!(e, "zelda@nowhere.test");
    }

    /// Several addresses of one person all collapse onto the first one listed.
    #[test]
    fn multiple_emails_same_person() {
        let mut map: HashMap<String, Vec<String>> = HashMap::new();
        map.insert(
            "Andre Ramos".to_string(),
            vec![
                "andre.ramos@duettoresearch.com".into(),
                "129991831+andreramosduetto@users.noreply.github.com".into(),
                "andre@personal.dev".into(),
            ],
        );
        let r = IdentityResolver::from_alias_map(&map);
        let (n1, e1) = r.resolve("Andre Ramos", "andre.ramos@duettoresearch.com");
        let (n2, e2) = r.resolve(
            "andreramosduetto",
            "129991831+andreramosduetto@users.noreply.github.com",
        );
        let (n3, e3) = r.resolve("A. Ramos", "andre@personal.dev");
        assert_eq!(n1, "Andre Ramos");
        assert_eq!(n2, "Andre Ramos");
        assert_eq!(n3, "Andre Ramos");
        assert_eq!(e1, "andre.ramos@duettoresearch.com");
        assert_eq!(e2, "andre.ramos@duettoresearch.com");
        assert_eq!(e3, "andre.ramos@duettoresearch.com");
    }

    /// End-to-end: a config file that points at an external aliases file
    /// yields a resolver that knows every alias in that file.
    #[test]
    fn duetto_contractors_config_resolves() {
        /// Deletes the scratch directory on drop, so it is cleaned up even
        /// when an assertion below panics.
        struct ScratchDir(std::path::PathBuf);
        impl Drop for ScratchDir {
            fn drop(&mut self) {
                let _ = std::fs::remove_dir_all(&self.0);
            }
        }

        let unique = format!(
            "tga-duetto-contractors-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .map(|d| d.as_nanos())
                .unwrap_or(0)
        );
        let scratch = ScratchDir(std::env::temp_dir().join(unique));
        let tmp = &scratch.0;
        std::fs::create_dir_all(tmp).expect("create tmp");

        // Each developer is one sequence item whose keys are nested under it;
        // without the nesting the keys would repeat at the top level.
        let aliases_yaml = r#"
developers:
  - name: "Andre Ramos"
    primary_email: "andre.ramos@duettoresearch.com"
    aliases:
      - "129991831+andreramosduetto@users.noreply.github.com"
  - name: "Akash Arora"
    primary_email: "akash.arora@duettoresearch.com"
    aliases:
      - "Akash.Arora-c@duettoresearch.com"
      - "akash-duetto"
  - name: "Janga Vinod Kumar Reddy"
    primary_email: "janga.reddy@duettoresearch.com"
    aliases:
      - "jangareddy-duetto"
      - "164324948+jangareddy-duetto@users.noreply.github.com"
"#;
        let aliases_path = tmp.join("aliases.yaml");
        std::fs::write(&aliases_path, aliases_yaml).expect("write aliases");

        // Single-quoted YAML scalars have no backslash escapes, so Windows
        // paths survive; the only character to escape is `'` (doubled).
        let config_yaml = format!(
            "version: \"1.0\"\naliases_file: '{}'\n",
            aliases_path.to_string_lossy().replace('\'', "''")
        );
        let config_path = tmp.join("duetto-contractors.yaml");
        std::fs::write(&config_path, config_yaml).expect("write config");

        let cfg =
            crate::core::config::Config::load(&config_path).expect("load duetto-contractors yaml");
        let r = IdentityResolver::from_config(&cfg);
        let (n, _) = r.resolve("whoever", "andre.ramos@duettoresearch.com");
        assert_eq!(n, "Andre Ramos");
        let (n, _) = r.resolve("whoever", "Akash.Arora-c@duettoresearch.com");
        assert_eq!(n, "Akash Arora");
        let (n, _) = r.resolve("jangareddy-duetto", "noise@nowhere.test");
        assert_eq!(n, "Janga Vinod Kumar Reddy");
    }

    /// Separators become spaces, case is folded, whitespace is collapsed.
    #[test]
    fn normalize_for_fuzzy_basic() {
        assert_eq!(normalize_for_fuzzy("Bob.Matsuoka"), "bob matsuoka");
        assert_eq!(normalize_for_fuzzy("alice_smith-c"), "alice smith c");
        assert_eq!(normalize_for_fuzzy("  Foo   Bar  "), "foo bar");
    }

    /// The local part is lower-cased; input without `@` is returned whole.
    #[test]
    fn email_local_part_basic() {
        assert_eq!(email_local_part("Bob@Example.COM"), "bob");
        assert_eq!(email_local_part("no-at-symbol"), "no-at-symbol");
    }
}