use once_cell::sync::Lazy;
use regex::Regex;
use serde::Serialize;
use std::collections::BTreeSet;
/// Named entities found in a piece of text by [`extract`].
#[derive(Serialize)]
pub struct Entities {
/// People recognised either by a trailing credential or by a leading title.
/// `extract` lists credentialed matches first and never repeats a name.
pub people: Vec<Person>,
/// Organization names recognised by a legal/institutional suffix.
/// `extract` returns them de-duplicated, in ascending order, capped at 20.
pub organizations: Vec<String>,
}
/// A single person mentioned in the text.
#[derive(Serialize)]
pub struct Person {
/// The person's name after `strip_honorific` has removed leading
/// prefix words and collapsed whitespace runs to single spaces.
pub name: String,
/// Credential as returned by `normalise_cred` (dots removed, upper-cased)
/// when the name was found next to one; `None` when the name was found
/// only through a leading title.
pub credentials: Option<String>,
}
/// Matches a title followed by a capitalised name of two to four words.
///
/// Capture group 1 is the name without the title. The abbreviated titles
/// (`Dr`, `Prof`, `Mr`, `Mrs`, `Ms`) all accept an optional trailing period,
/// so "Mr. John Smith" is recognised the same way as "Dr. John Smith";
/// `Professor`, `Sir` and `Dame` are matched as written.
static TITLED_NAME: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"\b(?:(?:Dr|Prof|Mr|Mrs|Ms)\.?|Professor|Sir|Dame)\s+([A-Z][a-zA-Z\-']+(?:\s+[A-Z][a-zA-Z\-']+){1,3})",
    )
    .expect("TITLED_NAME pattern is a valid regex")
});
/// Matches a capitalised name of two to four words followed by a credential.
///
/// Capture group 1 is the name and group 2 the credential as written in the
/// text. The name and credential must be separated by a comma (with optional
/// surrounding whitespace) or by at least one whitespace character. Without
/// that requirement an all-caps word that merely ends in credential letters
/// would be split in two, e.g. "Visit ORLANDO" read as "Visit ORLAN" + "DO".
static NAME_WITH_CRED: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"\b([A-Z][a-zA-Z\-']+(?:\s+[A-Z][a-zA-Z\-']+){1,3})(?:\s*,\s*|\s+)(MD|Ph\.?D\.?|MBA|MSc|MPH|DDS|DMD|JD|RN|DO|DPM|OD|PharmD|DVM|EdD|PsyD)\b",
    )
    .expect("NAME_WITH_CRED pattern is a valid regex")
});
/// Matches one to five capitalised words followed by a legal or institutional
/// suffix such as "Inc", "LLC", "University" or "Hospital".
///
/// Capture group 1 is the leading name and group 2 the suffix.
///
/// `Co.` is kept outside the group that ends in `\b`: a word boundary can
/// never hold between the period and a following space or end of text, so
/// with a shared trailing `\b` the "Co." form would not match ordinary prose.
/// For the suffixes with an optional period (`Inc.`, `Ltd.`, `Corp.`) the
/// boundary still applies, so group 2 holds them without the period.
static ORG_LEGAL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"\b([A-Z][A-Za-z0-9&\-]+(?:\s+[A-Z][A-Za-z0-9&\-]+){0,4})\s+(Co\.|(?:Inc\.?|LLC|Ltd\.?|Limited|Corp\.?|Corporation|Foundation|Institute|University|College|Society|Group|Holdings|Hospital|Clinic|Laboratory|Labs|Research)\b)",
    )
    .expect("ORG_LEGAL pattern is a valid regex")
});
/// Leading words that `strip_honorific` removes from a captured name.
///
/// The list mixes capitalised filler words that can precede a name in running
/// text ("Per", "By", "With", ...) with personal titles. "Professor", "Sir"
/// and "Dame" are included because the titled-name pattern treats them as
/// titles; if they were not stripped here, a credential match such as
/// "Professor Jane Smith, PhD" would be recorded under a different name than
/// the title match for the same person ("Jane Smith").
const HONORIFIC_PREFIXES: &[&str] = &[
    "Per", "By", "With", "From", "As", "Like", "And", "The", "Dr", "Prof", "Professor", "Mr",
    "Mrs", "Ms", "Sir", "Dame",
];
/// Extracts people and organizations mentioned in `body_text`.
///
/// People are gathered in two passes: first names followed by a credential
/// (stored with the normalised credential), then names preceded by a title
/// (stored without credentials). A name is kept only if it still has at least
/// two words after `strip_honorific`, and only the first occurrence of each
/// name is recorded, so a credentialed entry wins over a title-only one.
///
/// Organizations are de-duplicated, returned in ascending order and limited
/// to the first 20. Whitespace runs inside an organization name are collapsed
/// to single spaces so that a mention broken across lines is not reported as
/// a separate organization.
pub fn extract(body_text: &str) -> Entities {
    let mut people: Vec<Person> = Vec::new();
    let mut seen_names: BTreeSet<String> = BTreeSet::new();

    // Shared bookkeeping for both passes: clean the name, require at least
    // two words, and record each distinct name once.
    let mut record = |raw_name: &str, credentials: Option<String>| {
        let name = strip_honorific(raw_name);
        if name.split_whitespace().count() >= 2 && seen_names.insert(name.clone()) {
            people.push(Person { name, credentials });
        }
    };

    for cap in NAME_WITH_CRED.captures_iter(body_text) {
        record(&cap[1], Some(normalise_cred(&cap[2])));
    }
    for cap in TITLED_NAME.captures_iter(body_text) {
        record(&cap[1], None);
    }

    // A BTreeSet both de-duplicates and iterates in ascending order, so no
    // separate sort is needed before applying the cap.
    let unique_orgs: BTreeSet<String> = ORG_LEGAL
        .captures_iter(body_text)
        .map(|cap| {
            let words: Vec<&str> = cap[1].split_whitespace().collect();
            format!("{} {}", words.join(" "), &cap[2])
        })
        .collect();
    let organizations: Vec<String> = unique_orgs.into_iter().take(20).collect();

    Entities {
        people,
        organizations,
    }
}
/// Removes leading prefix words from `name` and re-joins the remaining words
/// with single spaces.
///
/// Words are compared against `HONORIFIC_PREFIXES` ignoring ASCII case. Only
/// the leading run of matching words is dropped; a matching word later in
/// the name is kept. Returns an empty string when every word is a prefix.
fn strip_honorific(name: &str) -> String {
    let is_prefix =
        |word: &str| HONORIFIC_PREFIXES.iter().any(|known| known.eq_ignore_ascii_case(word));
    let kept: Vec<&str> = name
        .split_whitespace()
        .skip_while(|word| is_prefix(word))
        .collect();
    kept.join(" ")
}
/// Returns `c` with every `.` removed and the remaining characters
/// upper-cased, e.g. "Ph.D." becomes "PHD".
fn normalise_cred(c: &str) -> String {
    c.chars()
        .filter(|&ch| ch != '.')
        .flat_map(char::to_uppercase)
        .collect()
}