use std::collections::HashMap;
use rusqlite::params;
use strsim::jaro_winkler;
use tracing::debug;
use crate::core::config::TeamConfig;
use crate::core::db::Database;
/// Minimum Jaro-Winkler score a new `IdentityResolver` requires in its first
/// fuzzy pass (raw lowercased name / full email); can be overridden per
/// resolver with `IdentityResolver::with_threshold`.
pub const DEFAULT_SIMILARITY_THRESHOLD: f64 = 0.85;
/// Minimum Jaro-Winkler score for the second fuzzy pass in
/// `IdentityResolver::resolve`, which compares separator-normalized names and
/// email local parts. This pass always uses this constant, not the
/// per-resolver threshold.
pub const NORMALIZED_SIMILARITY_THRESHOLD: f64 = 0.82;
/// Canonicalizes a name-like string for fuzzy comparison.
///
/// The input is lowercased, then split on whitespace and on the separators
/// `.`, `-` and `_`; the non-empty pieces are joined with single spaces.
/// For example `"John.Doe"` and `"john_doe"` both become `"john doe"`.
fn normalize_for_fuzzy(s: &str) -> String {
    let lowered = s.to_lowercase();
    let is_separator = |c: char| c.is_whitespace() || matches!(c, '.' | '-' | '_');

    let mut normalized = String::with_capacity(lowered.len());
    for word in lowered.split(is_separator).filter(|w| !w.is_empty()) {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
/// Returns the lowercased portion of `email` that precedes the first `@`.
///
/// When the input contains no `@` at all, the whole string is lowercased and
/// returned.
fn email_local_part(email: &str) -> String {
    let local = email.split_once('@').map_or(email, |(before, _)| before);
    local.to_lowercase()
}
/// Reports whether the domain of `email` equals `domain`, ignoring case.
///
/// `domain` is trimmed and may carry leading `@` characters, which are
/// dropped. The email's domain is everything after its last `@`. The result
/// is `false` when the cleaned-up `domain` is empty or when `email` contains
/// no `@`. Only exact domain equality counts; subdomains do not match.
pub fn email_domain_matches(email: &str, domain: &str) -> bool {
    let wanted = domain.trim().trim_start_matches('@').to_lowercase();
    if wanted.is_empty() {
        return false;
    }
    matches!(
        email.rsplit_once('@'),
        Some((_, host)) if host.to_lowercase() == wanted
    )
}
/// Maps raw author `(name, email)` pairs onto canonical identities using
/// explicit aliases first and Jaro-Winkler fuzzy matching as a fallback.
pub struct IdentityResolver {
// Lowercased alias (a name or an email address) -> canonical member name.
aliases: HashMap<String, String>,
// Known canonical identities as `(name, email)`; the email may be empty when
// none was known at construction time.
members: Vec<(String, String)>,
// Minimum Jaro-Winkler score accepted by the first fuzzy pass of `resolve`.
threshold: f64,
// Organization email domain, stored trimmed, without a leading `@`, and
// lowercased; `None` when not configured or blank.
canonical_domain: Option<String>,
}
impl IdentityResolver {
pub fn new(team: Option<&TeamConfig>) -> Self {
let mut aliases: HashMap<String, String> = HashMap::new();
let mut members: Vec<(String, String)> = Vec::new();
let mut canonical_domain: Option<String> = None;
if let Some(team) = team {
for (k, v) in &team.aliases {
aliases.insert(k.to_lowercase(), v.clone());
}
for m in &team.members {
members.push((m.name.clone(), m.email.clone()));
for a in &m.aliases {
aliases.insert(a.to_lowercase(), m.name.clone());
}
aliases.insert(m.email.to_lowercase(), m.name.clone());
}
canonical_domain = team
.canonical_domain
.as_ref()
.map(|d| d.trim().trim_start_matches('@').to_lowercase())
.filter(|d| !d.is_empty());
}
Self {
aliases,
members,
threshold: DEFAULT_SIMILARITY_THRESHOLD,
canonical_domain,
}
}
pub fn from_alias_map(map: &HashMap<String, Vec<String>>) -> Self {
let mut aliases: HashMap<String, String> = HashMap::new();
let mut members: Vec<(String, String)> = Vec::new();
for (canon_name, alias_list) in map {
let canon_email = alias_list
.iter()
.find(|a| a.contains('@'))
.cloned()
.unwrap_or_default();
members.push((canon_name.clone(), canon_email.clone()));
aliases.insert(canon_name.to_lowercase(), canon_name.clone());
if !canon_email.is_empty() {
aliases.insert(canon_email.to_lowercase(), canon_name.clone());
}
for a in alias_list {
aliases.insert(a.to_lowercase(), canon_name.clone());
}
}
Self {
aliases,
members,
threshold: DEFAULT_SIMILARITY_THRESHOLD,
canonical_domain: None,
}
}
pub fn from_config(config: &crate::core::config::Config) -> Self {
let map = config.resolved_aliases();
let mut resolver = if !map.is_empty() {
Self::from_alias_map(&map)
} else {
Self::new(config.team.as_ref())
};
if resolver.canonical_domain.is_none() {
if let Some(team) = config.team.as_ref() {
resolver.canonical_domain = team
.canonical_domain
.as_ref()
.map(|d| d.trim().trim_start_matches('@').to_lowercase())
.filter(|d| !d.is_empty());
}
}
resolver
}
pub fn with_threshold(mut self, threshold: f64) -> Self {
self.threshold = threshold;
self
}
pub fn add_alias(&mut self, alias: &str, canonical_name: &str) {
let alias = alias.trim();
let canonical = canonical_name.trim();
if alias.is_empty() || canonical.is_empty() {
return;
}
self.aliases
.insert(alias.to_lowercase(), canonical.to_string());
if self.find_member_by_name(canonical).is_none() {
let canonical_email = if alias.contains('@') {
alias.to_string()
} else {
String::new()
};
self.members.push((canonical.to_string(), canonical_email));
}
}
pub fn resolve(&self, name: &str, email: &str) -> (String, String) {
let email_lc = email.to_lowercase();
let name_lc = name.to_lowercase();
if let Some(canon_name) = self.aliases.get(&email_lc) {
if let Some((cn, ce)) = self.find_member_by_name(canon_name) {
return (cn, ce);
}
}
if let Some(canon_name) = self.aliases.get(&name_lc) {
if let Some((cn, ce)) = self.find_member_by_name(canon_name) {
return (cn, ce);
}
}
let mut best: Option<(f64, &(String, String))> = None;
for m in &self.members {
let s_name = jaro_winkler(&name_lc, &m.0.to_lowercase());
let s_email = jaro_winkler(&email_lc, &m.1.to_lowercase());
let score = s_name.max(s_email);
if score >= self.threshold && best.map(|(b, _)| score > b).unwrap_or(true) {
best = Some((score, m));
}
}
if let Some((score, m)) = best {
debug!(score, member = %m.0, "fuzzy identity match");
return (m.0.clone(), m.1.clone());
}
let name_norm = normalize_for_fuzzy(name);
let local_norm = normalize_for_fuzzy(&email_local_part(email));
let mut best_norm: Option<(f64, &(String, String))> = None;
for m in &self.members {
let canon_name_norm = normalize_for_fuzzy(&m.0);
let canon_local_norm = normalize_for_fuzzy(&email_local_part(&m.1));
let candidates = [
jaro_winkler(&local_norm, &canon_name_norm),
jaro_winkler(&local_norm, &canon_local_norm),
jaro_winkler(&name_norm, &canon_name_norm),
jaro_winkler(&name_norm, &canon_local_norm),
];
let score = candidates.iter().cloned().fold(0.0_f64, f64::max);
if score >= NORMALIZED_SIMILARITY_THRESHOLD
&& best_norm.map(|(b, _)| score > b).unwrap_or(true)
{
best_norm = Some((score, m));
}
}
if let Some((score, m)) = best_norm {
debug!(score, member = %m.0, "normalized fuzzy identity match");
return (m.0.clone(), m.1.clone());
}
(name.to_string(), email.to_string())
}
pub fn upsert_author(
&self,
db: &Database,
name: &str,
email: &str,
) -> crate::core::Result<i64> {
let (canon_name, mut canon_email) = self.resolve(name, email);
let conn = db.connection();
if let Some(domain) = &self.canonical_domain {
if !email_domain_matches(&canon_email, domain) {
let alt: Option<String> = conn
.query_row(
"SELECT canonical_email FROM authors \
WHERE LOWER(canonical_name) = LOWER(?1) \
AND LOWER(SUBSTR(canonical_email, INSTR(canonical_email, '@') + 1)) = ?2 \
LIMIT 1",
params![canon_name, domain],
|row| row.get::<_, String>(0),
)
.ok();
if let Some(found) = alt {
debug!(
prior_email = %canon_email,
chosen_email = %found,
domain = %domain,
"canonical_domain policy routed commit to existing org-domain identity"
);
canon_email = found;
}
}
}
conn.execute(
"INSERT INTO authors (canonical_name, canonical_email, aliases) \
VALUES (?1, ?2, '[]') \
ON CONFLICT(canonical_email) DO UPDATE SET canonical_name = excluded.canonical_name",
params![canon_name, canon_email],
)?;
let id: i64 = conn.query_row(
"SELECT id FROM authors WHERE canonical_email = ?1",
params![canon_email],
|row| row.get(0),
)?;
Ok(id)
}
pub fn canonical_domain(&self) -> Option<&str> {
self.canonical_domain.as_deref()
}
fn find_member_by_name(&self, name: &str) -> Option<(String, String)> {
self.members
.iter()
.find(|(n, _)| n.eq_ignore_ascii_case(name))
.cloned()
}
}
// Unit tests are compiled only for test builds and are loaded from the
// file `resolver_tests.rs` named by the `#[path]` attribute.
#[cfg(test)]
#[path = "resolver_tests.rs"]
mod tests;