use std::collections::HashMap;
use rusqlite::params;
use strsim::jaro_winkler;
use tracing::debug;
use crate::core::config::TeamConfig;
use crate::core::db::Database;
/// Minimum Jaro-Winkler score accepted by the first fuzzy pass of
/// `IdentityResolver::resolve` (lowercased name / full email comparison).
/// Every constructor in this file uses it as the resolver's initial threshold.
pub const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.85;
/// Minimum Jaro-Winkler score accepted by the second fuzzy pass of
/// `IdentityResolver::resolve`, which compares separator-normalized names and
/// email local parts. This value is fixed; `with_threshold` does not change it.
pub const NORMALIZED_SIMILARITY_THRESHOLD: f64 = 0.82;
/// Canonicalizes a name or email local part for fuzzy comparison.
///
/// The input is lowercased, then cut into words at any whitespace or at the
/// separators `.`, `-` and `_`; empty words are discarded and the remainder is
/// joined with single spaces. `"Bob.Matsuoka"` becomes `"bob matsuoka"`.
fn normalize_for_fuzzy(s: &str) -> String {
    let lowered = s.to_lowercase();
    let words = lowered
        .split(|c: char| c.is_whitespace() || matches!(c, '.' | '-' | '_'))
        .filter(|word| !word.is_empty());

    let mut normalized = String::with_capacity(lowered.len());
    for word in words {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
/// Returns the lowercased text before the first `@` in `email`.
///
/// When `email` contains no `@`, the whole string is lowercased and returned.
fn email_local_part(email: &str) -> String {
    let local = email.split_once('@').map_or(email, |(before, _)| before);
    local.to_lowercase()
}
/// Reports whether the domain of `email` equals `domain`, ignoring case.
///
/// `domain` is trimmed and may carry leading `@` characters, which are removed
/// before comparing. The email's domain is everything after its last `@`, and
/// it is not trimmed.
///
/// Returns `false` when the cleaned `domain` is empty or when `email` contains
/// no `@`.
pub fn email_domain_matches(email: &str, domain: &str) -> bool {
    let wanted = domain.trim().trim_start_matches('@').to_lowercase();
    if wanted.is_empty() {
        return false;
    }
    email
        .rsplit_once('@')
        .map_or(false, |(_, host)| host.to_lowercase() == wanted)
}
/// Maps raw commit author names and emails onto canonical team identities.
///
/// `resolve` first tries exact alias lookups and then two Jaro-Winkler fuzzy
/// passes over `members`; `upsert_author` persists the resolved identity.
pub struct IdentityResolver {
/// Lowercased alias (a name or an email address) -> canonical member name.
aliases: HashMap<String, String>,
/// Known identities as `(canonical_name, canonical_email)`. The email is an
/// empty string when no address was known when the member was registered.
members: Vec<(String, String)>,
/// Minimum score for the first fuzzy pass; constructors set it to
/// `DEFAULT_SIMILARITY_THRESHOLD` and `with_threshold` overrides it.
threshold: f64,
/// Preferred organisation email domain, stored lowercased and without a
/// leading `@`. `None` disables the domain-routing step in `upsert_author`.
canonical_domain: Option<String>,
}
impl IdentityResolver {
/// Builds a resolver from an optional team configuration.
///
/// With `None` the resolver has no aliases, members or canonical domain.
/// With a team:
/// * each team-level alias maps (lowercased) to its configured canonical name;
/// * each member is recorded as `(name, email)`, and every member alias plus
///   the member's own email maps (lowercased) to the member's name;
/// * `canonical_domain` is trimmed, stripped of leading `@`, lowercased, and
///   discarded if that leaves it empty.
///
/// The fuzzy threshold starts at `DEFAULT_SIMILARITY_THRESHOLD`.
pub fn new(team: Option<&TeamConfig>) -> Self {
    let mut resolver = Self {
        aliases: HashMap::new(),
        members: Vec::new(),
        threshold: DEFAULT_SIMILARITY_THRESHOLD,
        canonical_domain: None,
    };
    let team = match team {
        Some(team) => team,
        None => return resolver,
    };

    // Team-level aliases go in first, so a member-level entry with the same
    // lowercased key overwrites them.
    for (alias, canonical) in &team.aliases {
        resolver
            .aliases
            .insert(alias.to_lowercase(), canonical.clone());
    }

    for member in &team.members {
        resolver
            .members
            .push((member.name.clone(), member.email.clone()));
        for alias in &member.aliases {
            resolver
                .aliases
                .insert(alias.to_lowercase(), member.name.clone());
        }
        resolver
            .aliases
            .insert(member.email.to_lowercase(), member.name.clone());
    }

    resolver.canonical_domain = team.canonical_domain.as_ref().and_then(|raw| {
        let domain = raw.trim().trim_start_matches('@').to_lowercase();
        if domain.is_empty() {
            None
        } else {
            Some(domain)
        }
    });
    resolver
}
/// Builds a resolver from a `canonical name -> aliases` map.
///
/// For every entry:
/// * the first alias containing `@` becomes the member's canonical email
///   (an empty string when the list holds no email);
/// * the canonical name itself and every alias map (lowercased) to the
///   canonical name.
///
/// Entries are processed in sorted key order rather than `HashMap` iteration
/// order. `resolve` breaks fuzzy-score ties by member order, and a duplicated
/// alias is won by the last writer, so a randomized order would make
/// resolution differ between runs on the same configuration.
///
/// The resolver has no canonical domain and uses `DEFAULT_SIMILARITY_THRESHOLD`.
pub fn from_alias_map(map: &HashMap<String, Vec<String>>) -> Self {
    let mut entries: Vec<(&String, &Vec<String>)> = map.iter().collect();
    // Keys of a map are unique, so an unstable sort is still deterministic.
    entries.sort_unstable_by(|a, b| a.0.cmp(b.0));

    let mut aliases: HashMap<String, String> = HashMap::new();
    let mut members: Vec<(String, String)> = Vec::with_capacity(entries.len());
    for (canon_name, alias_list) in entries {
        let canon_email = alias_list
            .iter()
            .find(|a| a.contains('@'))
            .cloned()
            .unwrap_or_default();
        aliases.insert(canon_name.to_lowercase(), canon_name.clone());
        // The canonical email is itself a member of `alias_list`, so this loop
        // registers it along with every other alias.
        for a in alias_list {
            aliases.insert(a.to_lowercase(), canon_name.clone());
        }
        members.push((canon_name.clone(), canon_email));
    }
    Self {
        aliases,
        members,
        threshold: DEFAULT_SIMILARITY_THRESHOLD,
        canonical_domain: None,
    }
}
/// Builds a resolver from a full configuration.
///
/// When `config.resolved_aliases()` yields any entries, the resolver is built
/// from that map with `from_alias_map`; otherwise it is built from
/// `config.team` with `new`. If the result has no canonical domain and the
/// configuration has a team section, the team's `canonical_domain` is
/// normalized (trimmed, leading `@` removed, lowercased, dropped when empty)
/// and adopted.
pub fn from_config(config: &crate::core::config::Config) -> Self {
    let alias_map = config.resolved_aliases();
    let mut resolver = if alias_map.is_empty() {
        Self::new(config.team.as_ref())
    } else {
        Self::from_alias_map(&alias_map)
    };

    if resolver.canonical_domain.is_some() {
        return resolver;
    }

    // The alias-map path never sets a domain, so pick it up from the team
    // section here.
    if let Some(team) = config.team.as_ref() {
        resolver.canonical_domain = team.canonical_domain.as_ref().and_then(|raw| {
            let domain = raw.trim().trim_start_matches('@').to_lowercase();
            if domain.is_empty() {
                None
            } else {
                Some(domain)
            }
        });
    }
    resolver
}
pub fn with_threshold(mut self, threshold: f64) -> Self {
self.threshold = threshold;
self
}
/// Registers `alias` (a name or an email address) for `canonical_name`.
///
/// Both arguments are trimmed; the call is a no-op when either is empty.
/// The alias is stored lowercased. If no member with that canonical name
/// exists yet (ASCII case-insensitive comparison), one is created, using the
/// alias as its canonical email when the alias contains `@`.
///
/// If the member already exists but has no canonical email, an alias that
/// contains `@` is adopted as its canonical email. Without this, a member
/// first registered through a non-email alias would keep an empty email
/// forever, and `resolve` would hand that empty email to callers. An existing
/// non-empty canonical email is never replaced.
pub fn add_alias(&mut self, alias: &str, canonical_name: &str) {
    let alias = alias.trim();
    let canonical = canonical_name.trim();
    if alias.is_empty() || canonical.is_empty() {
        return;
    }
    self.aliases
        .insert(alias.to_lowercase(), canonical.to_string());

    let alias_is_email = alias.contains('@');
    let existing = self
        .members
        .iter()
        .position(|(member_name, _)| member_name.eq_ignore_ascii_case(canonical));
    match existing {
        Some(idx) => {
            let canonical_email = &mut self.members[idx].1;
            if alias_is_email && canonical_email.is_empty() {
                *canonical_email = alias.to_string();
            }
        }
        None => {
            let canonical_email = if alias_is_email {
                alias.to_string()
            } else {
                String::new()
            };
            self.members.push((canonical.to_string(), canonical_email));
        }
    }
}
/// Resolves a raw `(name, email)` pair to a canonical `(name, email)`.
///
/// Resolution order; the first step that succeeds wins:
/// 1. exact alias lookup on the lowercased email, then on the lowercased name
///    (the alias must point at a known member);
/// 2. fuzzy pass: Jaro-Winkler on lowercased name and on lowercased full
///    email against every member, accepting the best score at or above the
///    resolver's threshold;
/// 3. normalized fuzzy pass: Jaro-Winkler between the separator-normalized
///    input name / email local part and each member's normalized name /
///    email local part, accepting the best score at or above
///    `NORMALIZED_SIMILARITY_THRESHOLD`;
/// 4. otherwise the input is returned unchanged.
///
/// Ties go to the member that appears first in the member list. The returned
/// canonical email is whatever is stored for the member, which may be empty.
///
/// Empty strings never count as a match. `jaro_winkler("", "")` is 1.0, so
/// without this rule a commit with an empty email would fully match any member
/// stored without an email, and an empty key could hit an alias registered
/// from an empty string.
pub fn resolve(&self, name: &str, email: &str) -> (String, String) {
    // Similarity that treats an empty side as "no signal" instead of letting
    // two empty strings compare as identical.
    fn similarity(a: &str, b: &str) -> f64 {
        if a.is_empty() || b.is_empty() {
            0.0
        } else {
            jaro_winkler(a, b)
        }
    }

    let email_lc = email.to_lowercase();
    let name_lc = name.to_lowercase();

    // Step 1: exact alias hits, email before name.
    let exact = |key: &str| -> Option<(String, String)> {
        if key.is_empty() {
            return None;
        }
        self.aliases
            .get(key)
            .and_then(|canon_name| self.find_member_by_name(canon_name))
    };
    if let Some(hit) = exact(&email_lc).or_else(|| exact(&name_lc)) {
        return hit;
    }

    // Step 2: fuzzy match on the raw lowercased name / full address.
    // NOTE(review): the full address includes the domain, so two different
    // people on the same long domain may score highly here — confirm this is
    // the intended sensitivity.
    let mut best: Option<(f64, &(String, String))> = None;
    for m in &self.members {
        let s_name = similarity(&name_lc, &m.0.to_lowercase());
        let s_email = similarity(&email_lc, &m.1.to_lowercase());
        let score = s_name.max(s_email);
        if score >= self.threshold && best.map_or(true, |(b, _)| score > b) {
            best = Some((score, m));
        }
    }
    if let Some((score, m)) = best {
        debug!(score, member = %m.0, "fuzzy identity match");
        return (m.0.clone(), m.1.clone());
    }

    // Step 3: fuzzy match on separator-normalized names and local parts.
    let name_norm = normalize_for_fuzzy(name);
    let local_norm = normalize_for_fuzzy(&email_local_part(email));
    let mut best_norm: Option<(f64, &(String, String))> = None;
    for m in &self.members {
        let canon_name_norm = normalize_for_fuzzy(&m.0);
        let canon_local_norm = normalize_for_fuzzy(&email_local_part(&m.1));
        let candidates = [
            similarity(&local_norm, &canon_name_norm),
            similarity(&local_norm, &canon_local_norm),
            similarity(&name_norm, &canon_name_norm),
            similarity(&name_norm, &canon_local_norm),
        ];
        let score = candidates.iter().cloned().fold(0.0_f64, f64::max);
        if score >= NORMALIZED_SIMILARITY_THRESHOLD
            && best_norm.map_or(true, |(b, _)| score > b)
        {
            best_norm = Some((score, m));
        }
    }
    if let Some((score, m)) = best_norm {
        debug!(score, member = %m.0, "normalized fuzzy identity match");
        return (m.0.clone(), m.1.clone());
    }

    // Step 4: unknown author, pass through untouched.
    (name.to_string(), email.to_string())
}
pub fn upsert_author(
&self,
db: &Database,
name: &str,
email: &str,
) -> crate::core::Result<i64> {
let (canon_name, mut canon_email) = self.resolve(name, email);
let conn = db.connection();
if let Some(domain) = &self.canonical_domain {
if !email_domain_matches(&canon_email, domain) {
let alt: Option<String> = conn
.query_row(
"SELECT canonical_email FROM authors \
WHERE LOWER(canonical_name) = LOWER(?1) \
AND LOWER(SUBSTR(canonical_email, INSTR(canonical_email, '@') + 1)) = ?2 \
LIMIT 1",
params![canon_name, domain],
|row| row.get::<_, String>(0),
)
.ok();
if let Some(found) = alt {
debug!(
prior_email = %canon_email,
chosen_email = %found,
domain = %domain,
"canonical_domain policy routed commit to existing org-domain identity"
);
canon_email = found;
}
}
}
conn.execute(
"INSERT INTO authors (canonical_name, canonical_email, aliases) \
VALUES (?1, ?2, '[]') \
ON CONFLICT(canonical_email) DO UPDATE SET canonical_name = excluded.canonical_name",
params![canon_name, canon_email],
)?;
let id: i64 = conn.query_row(
"SELECT id FROM authors WHERE canonical_email = ?1",
params![canon_email],
|row| row.get(0),
)?;
Ok(id)
}
/// Returns the configured canonical email domain, if any, as stored by the
/// constructors (lowercased, without a leading `@`).
pub fn canonical_domain(&self) -> Option<&str> {
    self.canonical_domain.as_ref().map(String::as_str)
}
/// Looks up a member by canonical name, ignoring ASCII case, and returns an
/// owned `(canonical_name, canonical_email)` copy of the first match.
fn find_member_by_name(&self, name: &str) -> Option<(String, String)> {
    for member in &self.members {
        if member.0.eq_ignore_ascii_case(name) {
            return Some(member.clone());
        }
    }
    None
}
}
// Unit tests for identity resolution: exact alias lookups, the two fuzzy
// passes, the small string helpers, and the canonical-domain policy applied by
// `upsert_author` against an in-memory database.
#[cfg(test)]
mod tests {
use super::*;
use crate::core::config::{TeamConfig, TeamMember};
use std::collections::HashMap;
// Fixture: a single member (Bob Smith) with one email alias, plus the
// team-level name alias `bobby`; no canonical domain.
fn make_team() -> TeamConfig {
let mut aliases = HashMap::new();
aliases.insert("bobby".into(), "Bob Smith".into());
TeamConfig {
members: vec![TeamMember {
name: "Bob Smith".into(),
email: "bob@example.com".into(),
aliases: vec!["bsmith@example.com".into()],
}],
aliases,
canonical_domain: None,
}
}
// A member-level email alias resolves to the member's canonical name and email.
#[test]
fn exact_email_alias_match() {
let r = IdentityResolver::new(Some(&make_team()));
let (n, e) = r.resolve("Whoever", "bsmith@example.com");
assert_eq!(n, "Bob Smith");
assert_eq!(e, "bob@example.com");
}
// A team-level name alias resolves even when the email is unknown.
#[test]
fn exact_name_alias_match() {
let r = IdentityResolver::new(Some(&make_team()));
let (n, e) = r.resolve("bobby", "x@y.com");
assert_eq!(n, "Bob Smith");
assert_eq!(e, "bob@example.com");
}
// A near-miss spelling of the name resolves despite an unrelated email.
#[test]
fn fuzzy_match_canonical_name() {
let r = IdentityResolver::new(Some(&make_team()));
let (n, _e) = r.resolve("Bob Smyth", "unknown@elsewhere.com");
assert_eq!(n, "Bob Smith");
}
// An author unlike any member is returned unchanged.
#[test]
fn no_match_returns_input() {
let r = IdentityResolver::new(Some(&make_team()));
let (n, e) = r.resolve("Zelda Q", "zelda@nowhere.test");
assert_eq!(n, "Zelda Q");
assert_eq!(e, "zelda@nowhere.test");
}
// A resolver built without a team passes every input through.
#[test]
fn empty_team_passthrough() {
let r = IdentityResolver::new(None);
let (n, e) = r.resolve("Anyone", "anyone@x.com");
assert_eq!(n, "Anyone");
assert_eq!(e, "anyone@x.com");
}
// Every entry of an alias list resolves to the canonical name, and the first
// email in the list is reported as the canonical email.
#[test]
fn all_aliases_registered() {
let mut map: HashMap<String, Vec<String>> = HashMap::new();
map.insert(
"Alice Smith".to_string(),
vec![
"alice@company.com".into(),
"alice.smith@personal.com".into(),
"asmith".into(), ],
);
let r = IdentityResolver::from_alias_map(&map);
let (n, e) = r.resolve("whoever", "alice@company.com");
assert_eq!(n, "Alice Smith");
assert_eq!(e, "alice@company.com");
let (n, e) = r.resolve("whoever", "alice.smith@personal.com");
assert_eq!(n, "Alice Smith");
assert_eq!(e, "alice@company.com");
let (n, e) = r.resolve("asmith", "noise@nowhere.test");
assert_eq!(n, "Alice Smith");
assert_eq!(e, "alice@company.com");
}
// The same local part on a different domain still resolves to the member.
#[test]
fn email_local_part_fuzzy_match() {
let mut map: HashMap<String, Vec<String>> = HashMap::new();
map.insert(
"Bob Matsuoka".to_string(),
vec!["bob.matsuoka@duettoresearch.com".into()],
);
let r = IdentityResolver::from_alias_map(&map);
let (n, e) = r.resolve("Bob M", "bob.matsuoka@otherdomain.com");
assert_eq!(n, "Bob Matsuoka");
assert_eq!(e, "bob.matsuoka@duettoresearch.com");
}
// Email alias lookup ignores the case of the incoming address.
#[test]
fn case_insensitive_email_lookup() {
let mut map: HashMap<String, Vec<String>> = HashMap::new();
map.insert("Alice Smith".to_string(), vec!["alice@company.com".into()]);
let r = IdentityResolver::from_alias_map(&map);
let (n, e) = r.resolve("Whoever", "ALICE@COMPANY.COM");
assert_eq!(n, "Alice Smith");
assert_eq!(e, "alice@company.com");
let (n2, e2) = r.resolve("WhoEver", "Alice@Company.Com");
assert_eq!(n2, "Alice Smith");
assert_eq!(e2, "alice@company.com");
}
// An abbreviated name with an abbreviated local part resolves to the member.
#[test]
fn short_name_fuzzy() {
let mut map: HashMap<String, Vec<String>> = HashMap::new();
map.insert(
"Bob Matsuoka".to_string(),
vec!["bob.matsuoka@co.com".into()],
);
let r = IdentityResolver::from_alias_map(&map);
let (n, _e) = r.resolve("Bob M", "bobm@unknown.test");
assert_eq!(n, "Bob Matsuoka");
}
// With an alias-map resolver, an unrelated author is returned unchanged.
#[test]
fn unknown_author_passthrough() {
let mut map: HashMap<String, Vec<String>> = HashMap::new();
map.insert("Alice Smith".to_string(), vec!["alice@company.com".into()]);
let r = IdentityResolver::from_alias_map(&map);
let (n, e) = r.resolve("Zelda Q", "zelda@nowhere.test");
assert_eq!(n, "Zelda Q");
assert_eq!(e, "zelda@nowhere.test");
}
// Three different addresses of one person all collapse onto the first
// listed email.
#[test]
fn multiple_emails_same_person() {
let mut map: HashMap<String, Vec<String>> = HashMap::new();
map.insert(
"Andre Ramos".to_string(),
vec![
"andre.ramos@duettoresearch.com".into(),
"129991831+andreramosduetto@users.noreply.github.com".into(),
"andre@personal.dev".into(),
],
);
let r = IdentityResolver::from_alias_map(&map);
let (n1, e1) = r.resolve("Andre Ramos", "andre.ramos@duettoresearch.com");
let (n2, e2) = r.resolve(
"andreramosduetto",
"129991831+andreramosduetto@users.noreply.github.com",
);
let (n3, e3) = r.resolve("A. Ramos", "andre@personal.dev");
assert_eq!(n1, "Andre Ramos");
assert_eq!(n2, "Andre Ramos");
assert_eq!(n3, "Andre Ramos");
assert_eq!(e1, "andre.ramos@duettoresearch.com");
assert_eq!(e2, "andre.ramos@duettoresearch.com");
assert_eq!(e3, "andre.ramos@duettoresearch.com");
}
// End-to-end: writes an aliases file and a config referencing it into a unique
// temp directory, loads it with `Config::load`, and checks `from_config`.
// NOTE(review): the temp directory is only removed on the last line, so a
// failing assertion leaves it behind.
#[test]
fn duetto_contractors_config_resolves() {
let unique = format!(
"tga-duetto-contractors-{}-{}",
std::process::id(),
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_nanos())
.unwrap_or(0)
);
let tmp = std::env::temp_dir().join(unique);
std::fs::create_dir_all(&tmp).expect("create tmp");
let aliases_yaml = r#"
developers:
- name: "Andre Ramos"
primary_email: "andre.ramos@duettoresearch.com"
aliases:
- "129991831+andreramosduetto@users.noreply.github.com"
- name: "Akash Arora"
primary_email: "akash.arora@duettoresearch.com"
aliases:
- "Akash.Arora-c@duettoresearch.com"
- "akash-duetto"
- name: "Janga Vinod Kumar Reddy"
primary_email: "janga.reddy@duettoresearch.com"
aliases:
- "jangareddy-duetto"
- "164324948+jangareddy-duetto@users.noreply.github.com"
"#;
let aliases_path = tmp.join("aliases.yaml");
std::fs::write(&aliases_path, aliases_yaml).expect("write aliases");
let config_yaml = format!(
"version: \"1.0\"\naliases_file: \"{}\"\n",
aliases_path.to_string_lossy()
);
let config_path = tmp.join("duetto-contractors.yaml");
std::fs::write(&config_path, config_yaml).expect("write config");
let cfg =
crate::core::config::Config::load(&config_path).expect("load duetto-contractors yaml");
let r = IdentityResolver::from_config(&cfg);
let (n, _) = r.resolve("whoever", "andre.ramos@duettoresearch.com");
assert_eq!(n, "Andre Ramos");
let (n, _) = r.resolve("whoever", "Akash.Arora-c@duettoresearch.com");
assert_eq!(n, "Akash Arora");
let (n, _) = r.resolve("jangareddy-duetto", "noise@nowhere.test");
assert_eq!(n, "Janga Vinod Kumar Reddy");
let _ = std::fs::remove_dir_all(&tmp);
}
// Separators become spaces, case is folded, and whitespace runs collapse.
#[test]
fn normalize_for_fuzzy_basic() {
assert_eq!(normalize_for_fuzzy("Bob.Matsuoka"), "bob matsuoka");
assert_eq!(normalize_for_fuzzy("alice_smith-c"), "alice smith c");
assert_eq!(normalize_for_fuzzy(" Foo Bar "), "foo bar");
}
// The local part is lowercased; input without `@` is returned whole.
#[test]
fn email_local_part_basic() {
assert_eq!(email_local_part("Bob@Example.COM"), "bob");
assert_eq!(email_local_part("no-at-symbol"), "no-at-symbol");
}
// Domain comparison ignores case and a leading `@`, and rejects malformed
// emails and an empty domain.
#[test]
fn email_domain_matches_basic() {
assert!(email_domain_matches(
"a@DUETTORESEARCH.COM",
"duettoresearch.com"
));
assert!(email_domain_matches(
"a@duettoresearch.com",
"@duettoresearch.com"
));
assert!(!email_domain_matches("a@other.com", "duettoresearch.com"));
assert!(!email_domain_matches("invalid-email", "duettoresearch.com"));
assert!(!email_domain_matches("a@duettoresearch.com", ""));
}
// A personal-address alias of a team member resolves to the member's
// org-domain email, and the configured domain is exposed by the accessor.
#[test]
fn canonical_domain_prefers_org_email_for_team_member() {
let team = TeamConfig {
members: vec![TeamMember {
name: "Alice Org".into(),
email: "alice@duettoresearch.com".into(),
aliases: vec!["alice@personal.com".into()],
}],
aliases: HashMap::new(),
canonical_domain: Some("duettoresearch.com".into()),
};
let r = IdentityResolver::new(Some(&team));
let (_, e) = r.resolve("Alice Org", "alice@personal.com");
assert_eq!(e, "alice@duettoresearch.com");
assert_eq!(r.canonical_domain(), Some("duettoresearch.com"));
}
// With a canonical domain and no configured members, a later commit from a
// personal address is routed onto the already-stored org-domain row for the
// same name, leaving a single row.
#[test]
fn canonical_domain_routes_new_personal_email_to_existing_org_row() {
let team = TeamConfig {
members: vec![],
aliases: HashMap::new(),
canonical_domain: Some("duettoresearch.com".into()),
};
let r = IdentityResolver::new(Some(&team));
let db = Database::open_in_memory().expect("db");
let _ = r
.upsert_author(&db, "Bob Matsuoka", "bob@duettoresearch.com")
.expect("seed");
let id = r
.upsert_author(&db, "Bob Matsuoka", "bob@personal.com")
.expect("upsert");
let stored_email: String = db
.connection()
.query_row(
"SELECT canonical_email FROM authors WHERE id = ?1",
params![id],
|row| row.get(0),
)
.expect("lookup");
assert_eq!(stored_email, "bob@duettoresearch.com");
let count: i64 = db
.connection()
.query_row(
"SELECT COUNT(*) FROM authors WHERE canonical_name = 'Bob Matsuoka'",
[],
|row| row.get(0),
)
.expect("count");
assert_eq!(count, 1);
}
// Without a canonical domain, two different emails for the same name produce
// two separate rows.
#[test]
fn canonical_domain_absent_falls_back_to_first_seen_email() {
let r = IdentityResolver::new(None);
assert_eq!(r.canonical_domain(), None);
let db = Database::open_in_memory().expect("db");
let _ = r
.upsert_author(&db, "Carol", "carol@personal.com")
.expect("seed");
let _ = r
.upsert_author(&db, "Carol", "carol@work.com")
.expect("upsert");
let count: i64 = db
.connection()
.query_row(
"SELECT COUNT(*) FROM authors WHERE canonical_name = 'Carol'",
[],
|row| row.get(0),
)
.expect("count");
assert_eq!(count, 2);
}
// `from_config` picks up `team.canonical_domain` from parsed YAML.
#[test]
fn canonical_domain_read_from_config() {
let yaml = r#"
team:
canonical_domain: "duettoresearch.com"
members:
- name: "Alice"
email: "alice@duettoresearch.com"
"#;
let cfg: crate::core::config::Config = serde_yaml::from_str(yaml).expect("parse");
let r = IdentityResolver::from_config(&cfg);
assert_eq!(r.canonical_domain(), Some("duettoresearch.com"));
}
}