/// Scans `text` for URLs, e-mail addresses, `@mentions` and capitalized phrases.
///
/// Returns `(label, value)` pairs where the label is `"Url"`, `"Email"`,
/// `"Mention"` or `"Person"`. Results are grouped by label in that order, keep
/// their order of first appearance within each group, and every
/// `(label, value)` pair is reported at most once.
pub fn extract_entities(text: &str) -> Vec<(String, String)> {
    // Punctuation stripped from both ends of a token before the URL and mention checks.
    const EDGE_PUNCT: &[char] = &[',', '.', ')', '(', ';'];
    // E-mail candidates are additionally unwrapped from angle brackets.
    const EMAIL_EDGE_PUNCT: &[char] = &[',', '.', ')', '(', ';', '<', '>'];

    let mut found: Vec<(String, String)> = Vec::new();
    let mut already_seen: std::collections::HashSet<(&'static str, String)> =
        std::collections::HashSet::new();

    // Appends `(label, value)` unless that exact pair was recorded earlier.
    let mut record = |label: &'static str, value: String| {
        if already_seen.insert((label, value.clone())) {
            found.push((label.to_string(), value));
        }
    };

    // URLs: an http(s) scheme prefix and more than ten bytes overall.
    text.split_whitespace()
        .map(|token| token.trim_matches(EDGE_PUNCT))
        .filter(|candidate| {
            (candidate.starts_with("http://") || candidate.starts_with("https://"))
                && candidate.len() > 10
        })
        .for_each(|url| record("Url", url.to_string()));

    // E-mail addresses.
    text.split_whitespace()
        .map(|token| token.trim_matches(EMAIL_EDGE_PUNCT))
        .filter(|candidate| is_email_like(candidate))
        .for_each(|address| record("Email", address.to_string()));

    // Mentions: a leading '@' followed by at least one more byte.
    text.split_whitespace()
        .map(|token| token.trim_matches(EDGE_PUNCT))
        .filter(|candidate| candidate.starts_with('@') && candidate.len() > 1)
        .for_each(|handle| record("Mention", handle.to_string()));

    // Capitalized multi-word phrases are all reported under the "Person" label.
    for phrase in extract_capitalized_phrases(text) {
        record("Person", phrase);
    }

    found
}
/// Heuristically decides whether `s` has the shape of an e-mail address.
///
/// The string is split at its first `@`. The part before it must be non-empty
/// and consist only of alphanumeric characters, `.`, `_`, `-` or `+`. The part
/// after it must be non-empty, contain a `.` that is neither its first nor its
/// last character, and consist only of alphanumeric characters, `.` or `-`.
/// Because `@` is not allowed in the domain, a second `@` makes the check fail.
fn is_email_like(s: &str) -> bool {
    let Some((local, domain)) = s.split_once('@') else {
        return false;
    };

    let local_ok = !local.is_empty()
        && local
            .chars()
            .all(|ch| ch.is_alphanumeric() || matches!(ch, '.' | '_' | '-' | '+'));

    let domain_ok = !domain.is_empty()
        && domain.contains('.')
        && !(domain.starts_with('.') || domain.ends_with('.'))
        && domain
            .chars()
            .all(|ch| ch.is_alphanumeric() || matches!(ch, '.' | '-'));

    local_ok && domain_ok
}
/// Collects runs of two or more consecutive capitalized words from `text`.
///
/// Tokens are split on whitespace and stripped of leading and trailing
/// non-alphanumeric characters. A token takes part in a run when its first
/// character is uppercase and it is longer than one character. The length is
/// counted in characters rather than bytes, so a lone non-ASCII capital such
/// as `À` is skipped just like a lone `A`.
///
/// A run ends after a token whose trailing punctuation contains `,`, `;`, `:`,
/// `!` or `?`, so list items and separate clauses ("Alice, Bob") are not glued
/// into one phrase. A trailing `.` does not end a run, which keeps
/// abbreviations together ("Dr. John Smith" yields "Dr John Smith").
///
/// A two-word run at the very start of the text, or directly after a token
/// ending in `.`, `!` or `?`, is dropped when its first word is a common
/// sentence starter according to `is_common_starter`.
///
/// Returns the phrases in order of appearance, words joined by single spaces.
fn extract_capitalized_phrases(text: &str) -> Vec<String> {
    // Removes surrounding quotes, brackets and punctuation from a raw token.
    fn strip_punct(raw: &str) -> &str {
        raw.trim_matches(|c: char| !c.is_alphanumeric())
    }

    // Capitalized and at least two characters long (characters, not bytes).
    fn is_phrase_word(word: &str) -> bool {
        is_capitalized(word) && word.chars().nth(1).is_some()
    }

    // True when the punctuation trailing `raw` marks a clause boundary.
    fn ends_clause(raw: &str) -> bool {
        let kept = raw.trim_end_matches(|c: char| !c.is_alphanumeric());
        // `kept` is a prefix of `raw`, so its length is a valid char boundary.
        raw[kept.len()..]
            .chars()
            .any(|c| matches!(c, ',' | ';' | ':' | '!' | '?'))
    }

    let words: Vec<&str> = text.split_whitespace().collect();
    let mut phrases = Vec::new();
    let mut i = 0;

    while i < words.len() {
        let first = strip_punct(words[i]);
        if !is_phrase_word(first) {
            i += 1;
            continue;
        }

        let start = i;
        let mut parts = vec![first];
        i += 1;

        // Extend the run while the previous token did not close a clause and
        // the next token still qualifies.
        while i < words.len() && !ends_clause(words[i - 1]) {
            let next = strip_punct(words[i]);
            if !is_phrase_word(next) {
                break;
            }
            parts.push(next);
            i += 1;
        }

        if parts.len() < 2 {
            continue;
        }

        let is_sentence_start = start == 0 || words[start - 1].ends_with(['.', '!', '?']);

        // "The Cat" at the start of a sentence is ordinary capitalization, not a name.
        if is_sentence_start && parts.len() == 2 && is_common_starter(parts[0]) {
            continue;
        }

        phrases.push(parts.join(" "));
    }

    phrases
}
/// Returns `true` when the first character of `word` is an uppercase letter.
///
/// An empty string, or one starting with a digit or a lowercase letter,
/// yields `false`.
fn is_capitalized(word: &str) -> bool {
    match word.chars().next() {
        Some(first) => first.is_uppercase(),
        None => false,
    }
}
/// Reports whether `word`, once lowercased, is one of the articles,
/// demonstratives or pronouns listed in `STARTERS`.
fn is_common_starter(word: &str) -> bool {
    const STARTERS: [&str; 13] = [
        "the", "a", "an", "this", "that", "these", "those", "it", "i", "we", "they", "he", "she",
    ];

    let lowered = word.to_lowercase();
    STARTERS.iter().any(|starter| *starter == lowered)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extractor on `input` and returns, in output order, the values
    /// of every entity tagged with `label`.
    fn values_with_label(input: &str, label: &str) -> Vec<String> {
        extract_entities(input)
            .into_iter()
            .filter_map(|(kind, value)| (kind == label).then_some(value))
            .collect()
    }

    /// Both URLs are found, in order, with trailing punctuation removed.
    #[test]
    fn test_extract_urls() {
        let found = values_with_label(
            "Visit https://example.com/page and http://foo.bar/baz for info.",
            "Url",
        );
        assert_eq!(found, ["https://example.com/page", "http://foo.bar/baz"]);
    }

    /// Two distinct addresses produce two `Email` entries.
    #[test]
    fn test_extract_emails() {
        let found = values_with_label(
            "Email alice@example.com or bob@company.org for help.",
            "Email",
        );
        assert_eq!(found.len(), 2);
    }

    /// Handles keep their leading `@` and lose the trailing comma.
    #[test]
    fn test_extract_mentions() {
        let found = values_with_label("Hey @alice and @bob-dev, check this out.", "Mention");
        assert_eq!(found, ["@alice", "@bob-dev"]);
    }

    /// Multi-word capitalized runs are reported under the `Person` label.
    #[test]
    fn test_extract_capitalized_phrases() {
        let found = values_with_label(
            "I met John Smith at the World Trade Center yesterday.",
            "Person",
        );
        assert!(found.iter().any(|name| name == "John Smith"));
        assert!(found.iter().any(|name| name == "World Trade Center"));
    }

    /// A capitalized first word on its own must not yield a `Person`.
    #[test]
    fn test_no_false_positives_on_sentence_start() {
        let found = values_with_label("The cat sat on the mat.", "Person");
        assert!(found.is_empty());
    }

    /// A URL that occurs twice is reported once.
    #[test]
    fn test_deduplication() {
        let found = values_with_label(
            "Visit https://example.com and https://example.com again.",
            "Url",
        );
        assert_eq!(found.len(), 1);
    }
}