use super::*;
#[test]
fn test_basic_person_detection() {
    // A personal title ("Dr.") followed by capitalized tokens should
    // produce a span covering the name.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("Dr. John Smith met with Mary.", None)
        .unwrap();
    let names: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    let detected = names
        .iter()
        .any(|n| n.contains("John") || n.contains("Smith"));
    assert!(detected, "Should detect John Smith: {:?}", names);
}
#[test]
fn test_organization_suffix_detection() {
    // A corporate suffix ("Inc.") should mark the preceding span as ORG.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("Apple Inc. announced new products.", None)
        .unwrap();
    let has_org = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Organization));
    assert!(has_org, "Should detect Apple Inc. as organization");
}
#[test]
fn test_location_preposition_context() {
    // "lived in X" supplies a prepositional LOC context signal.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("She lived in Paris for years.", None)
        .unwrap();
    let has_loc = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Location));
    assert!(has_loc, "Should detect Paris as location");
}
#[test]
fn test_known_organizations() {
    // Gazetteer entries should be found without any corporate suffix.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("Google and Microsoft competed.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    let has_google = texts.iter().any(|t| t.contains("Google"));
    let has_microsoft = texts.iter().any(|t| t.contains("Microsoft"));
    assert!(has_google, "Should detect Google");
    assert!(has_microsoft, "Should detect Microsoft");
}
#[test]
fn test_cjk_organization_detection() {
    // Katakana company names must be recognized despite CJK text having
    // no capitalization cues.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("ソニーが新製品を発表しました。", None)
        .unwrap();
    let has_org = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Organization));
    assert!(has_org, "Should detect Sony (ソニー) as organization");
}
#[test]
fn test_cjk_location_detection() {
    // CJK place names must be recognized without capitalization cues.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("東京オリンピックが開催された。", None)
        .unwrap();
    let has_loc = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Location));
    assert!(has_loc, "Should detect Tokyo (東京) as location");
}
#[test]
fn test_empty_text() {
    // Empty input must yield an empty entity list, not an error.
    let model = HeuristicNER::new();
    let found = model.extract_entities("", None).unwrap();
    assert!(found.is_empty());
}
#[test]
fn test_no_entities() {
    // All-lowercase text carries no capitalization signal, so nothing
    // should be extracted.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("the quick brown fox jumps over the lazy dog", None)
        .unwrap();
    assert!(found.is_empty(), "Lowercase text should have no entities");
}
#[test]
fn test_threshold_filtering() {
    // Raising the confidence threshold can only shrink the result set.
    let text = "John works at Google.";
    let permissive = HeuristicNER::with_threshold(0.1);
    let strict = HeuristicNER::with_threshold(0.9);
    let many = permissive.extract_entities(text, None).unwrap();
    let few = strict.extract_entities(text, None).unwrap();
    assert!(many.len() >= few.len());
}
#[test]
fn test_sentence_starter_filtering() {
    // Sentence-initial determiners and pronouns are capitalized but must
    // not survive as entities.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("The dog ran. It was fast.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    assert!(
        !texts.contains(&"The"),
        "Should filter 'The' as sentence starter"
    );
    assert!(!texts.contains(&"It"), "Should filter 'It' as pronoun");
}
#[test]
fn test_person_prefix_detection() {
    // A title prefix such as "Prof." should type the following name as PER.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("Prof. Einstein presented the theory.", None)
        .unwrap();
    let has_person = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Person));
    assert!(has_person, "Should detect Prof. Einstein as person");
}
#[test]
fn test_multi_word_organization() {
    // The "X of Y" institutional pattern should be recognized as ORG.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("Bank of America provides services.", None)
        .unwrap();
    let has_org = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Organization));
    assert!(has_org, "Should detect 'Bank of America' pattern");
}
#[test]
fn test_location_indicators() {
    // Location indicator words should yield a LOC span for "New Zealand".
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("New Zealand is beautiful.", None)
        .unwrap();
    let has_loc = found
        .iter()
        .any(|e| matches!(e.entity_type, EntityType::Location));
    assert!(has_loc, "Should detect 'New Zealand' as location");
}
#[test]
fn test_model_trait_implementation() {
    // Sanity-check the trait surface exposed by the heuristic backend.
    let model = HeuristicNER::new();
    assert_eq!(model.name(), "heuristic");
    assert!(model.is_available());
    assert!(!model.supported_types().is_empty());
    assert!(model.description().contains("Heuristic"));
}
#[test]
fn test_entity_offsets_are_valid() {
    // Offsets are character-based; every reported span must round-trip
    // back to the entity text via char indexing.
    let model = HeuristicNER::new();
    let text = "Barack Obama visited Berlin yesterday.";
    let total_chars = text.chars().count();
    for entity in &model.extract_entities(text, None).unwrap() {
        let (start, end) = (entity.start(), entity.end());
        assert!(start <= end, "start should be <= end");
        assert!(end <= total_chars, "end should be within text");
        let slice: String = text.chars().skip(start).take(end - start).collect();
        assert_eq!(
            slice, entity.text,
            "Extracted text should match entity text"
        );
    }
}
#[test]
fn test_unicode_text_handling() {
    // Accented (multi-byte) names: char-based offsets must round-trip.
    let model = HeuristicNER::new();
    let text = "François Müller from München met José García.";
    for entity in &model.extract_entities(text, None).unwrap() {
        let span: String = text
            .chars()
            .skip(entity.start())
            .take(entity.end() - entity.start())
            .collect();
        assert_eq!(span, entity.text, "Unicode offsets should be correct");
    }
}
#[test]
fn test_provenance_is_set() {
    // Every entity produced by the heuristic backend must carry provenance.
    // The previous version used `if let Some(...)`, which made all the
    // assertions vacuous whenever provenance was None — the very condition
    // this test (named "is_set") exists to catch. It also passed trivially
    // on an empty entity list.
    let ner = HeuristicNER::new();
    let entities = ner
        .extract_entities("Google announced today.", None)
        .unwrap();
    assert!(!entities.is_empty(), "Should extract at least one entity");
    for entity in &entities {
        let prov = entity
            .provenance
            .as_ref()
            .expect("heuristic entities should have provenance set");
        assert_eq!(prov.source, "heuristic");
        assert!(matches!(prov.method, ExtractionMethod::Heuristic));
    }
}
#[test]
fn test_is_acronym_word_latin() {
    // Latin-script acronyms: all-caps tokens, optionally with digits or
    // surrounding punctuation; mixed-case words and bare digits are not.
    for acronym in ["PARC", "IBM", "NASA", "N2K", "DARPA.", "(NATO)"] {
        assert!(is_acronym_word(acronym));
    }
    for non_acronym in ["Xerox", "Lynn", "A", "42", ""] {
        assert!(!is_acronym_word(non_acronym));
    }
}
#[test]
fn test_is_acronym_word_cyrillic() {
    // Cased non-Latin scripts participate in acronym detection too.
    assert!(is_acronym_word("\u{041D}\u{0410}\u{0422}\u{041E}")); // НАТО
    assert!(is_acronym_word("\u{041C}\u{0418}\u{0414}")); // МИД
    // A normally-cased Cyrillic word is not an acronym.
    assert!(!is_acronym_word(
        "\u{041C}\u{043E}\u{0441}\u{043A}\u{0432}\u{0430}" // Москва
    ));
}
#[test]
fn test_is_acronym_word_caseless_scripts() {
    // Scripts without letter case can never form acronyms.
    assert!(!is_acronym_word("\u{6771}\u{4EAC}")); // 東京 (CJK)
    assert!(!is_acronym_word("\u{30BD}\u{30CB}\u{30FC}")); // ソニー (katakana)
    assert!(!is_acronym_word("\u{062D}\u{0645}\u{0627}\u{0633}")); // Arabic
}
#[test]
fn test_acronym_in_multi_word_span_signals_org() {
    // An all-caps token inside a multi-word span should tip its type to ORG.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities(
            "Lynn Conway worked at IBM and Xerox PARC in California.",
            None,
        )
        .unwrap();
    let xerox_parc = entities.iter().find(|e| e.text == "Xerox PARC");
    assert!(
        xerox_parc.is_some(),
        "Should detect 'Xerox PARC': {entities:?}"
    );
    let span = xerox_parc.unwrap();
    assert!(
        matches!(span.entity_type, EntityType::Organization),
        "Xerox PARC should be ORG, got {:?}",
        span.entity_type,
    );
}
#[test]
fn test_acronym_no_regression_on_normal_names() {
    // Ordinary capitalized name pairs must stay PER despite the acronym rule.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities("Lynn Conway designed the processor.", None)
        .unwrap();
    let lynn = entities.iter().find(|e| e.text == "Lynn Conway");
    assert!(lynn.is_some(), "Should detect 'Lynn Conway': {entities:?}");
    let span = lynn.unwrap();
    assert!(
        matches!(span.entity_type, EntityType::Person),
        "Lynn Conway should remain PER, got {:?}",
        span.entity_type,
    );
}
#[test]
fn test_single_acronym_signals_org() {
    // A standalone unknown acronym defaults to ORG.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities("She joined DARPA last year.", None)
        .unwrap();
    let darpa = entities.iter().find(|e| e.text == "DARPA");
    assert!(darpa.is_some(), "Should detect 'DARPA': {entities:?}");
    let span = darpa.unwrap();
    assert!(
        matches!(span.entity_type, EntityType::Organization),
        "DARPA should be ORG, got {:?}",
        span.entity_type,
    );
}
#[test]
fn test_known_loc_acronym_still_loc() {
    // Gazetteer classification (USA = LOC) outranks the acronym→ORG rule.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities("She moved to USA last year.", None)
        .unwrap();
    let usa = entities.iter().find(|e| e.text == "USA");
    assert!(usa.is_some(), "Should detect 'USA': {entities:?}");
    let span = usa.unwrap();
    assert!(
        matches!(span.entity_type, EntityType::Location),
        "USA should be LOC (gazetteer wins), got {:?}",
        span.entity_type,
    );
}
#[test]
fn test_international_org_suffix_gmbh() {
    // Non-English corporate suffixes (GmbH) should also signal ORG, and
    // the suffix itself must be kept inside the span.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("Siemens GmbH reported earnings.", None)
        .unwrap();
    let org_spans: Vec<_> = found
        .iter()
        .filter(|e| matches!(e.entity_type, EntityType::Organization))
        .collect();
    assert!(!org_spans.is_empty(), "Should detect 'Siemens GmbH' as ORG");
    let keeps_suffix = org_spans.iter().any(|e| e.text.contains("GmbH"));
    assert!(
        keeps_suffix,
        "Entity text should include GmbH suffix: {org_spans:?}"
    );
}
#[test]
fn test_skip_word_filters_job_titles() {
    // Even with the threshold at 0, bare job titles like "CEO" are
    // skip-listed and never emitted.
    let model = HeuristicNER::with_threshold(0.0);
    let found = model
        .extract_entities("the CEO spoke at the event.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    let has_ceo = texts.iter().any(|t| t.eq_ignore_ascii_case("CEO"));
    assert!(!has_ceo, "CEO should be filtered as skip_word: {texts:?}");
}
#[test]
fn test_skip_pronoun_filters_single_pronouns() {
    // Capitalized sentence-initial pronouns must never become entities,
    // even with the threshold disabled.
    let model = HeuristicNER::with_threshold(0.0);
    let found = model
        .extract_entities("He ran. She swam. They left.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    for pronoun in ["He", "She", "They"] {
        assert!(
            !texts.contains(&pronoun),
            "{pronoun} should be filtered: {texts:?}"
        );
    }
}
#[test]
fn test_single_letter_not_entity() {
    // One-letter tokens carry too little signal to count as entities.
    let model = HeuristicNER::with_threshold(0.0);
    let found = model
        .extract_entities("variable X was defined.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    assert!(
        !texts.contains(&"X"),
        "Single letter 'X' should be skipped: {texts:?}"
    );
}
#[test]
fn test_three_word_span_defaults_to_org() {
    // With no other signal, a three-token capitalized span defaults to ORG.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities("Global Dynamics Research announced funding.", None)
        .unwrap();
    let span = entities
        .iter()
        .find(|e| e.text == "Global Dynamics Research");
    assert!(
        span.is_some(),
        "Should detect 'Global Dynamics Research': {entities:?}"
    );
    let org = span.unwrap();
    assert!(
        matches!(org.entity_type, EntityType::Organization),
        "Three-word span should be ORG, got {:?}",
        org.entity_type,
    );
}
#[test]
fn test_single_capitalized_mid_sentence_defaults_person() {
    // A lone capitalized token in the middle of a sentence defaults to PER.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities("I spoke with Valentina about the plan.", None)
        .unwrap();
    let val = entities.iter().find(|e| e.text == "Valentina");
    assert!(val.is_some(), "Should detect 'Valentina': {entities:?}");
    let person = val.unwrap();
    assert!(
        matches!(person.entity_type, EntityType::Person),
        "Single capitalized mid-sentence should be PER, got {:?}",
        person.entity_type,
    );
}
#[test]
fn test_and_separates_entities() {
    // Coordinating "and" must split spans rather than fuse them.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("We met Alice and Bob at the event.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    let fused = texts.iter().any(|t| t.contains("and"));
    assert!(
        !fused,
        "'and' should separate entities, not join them: {texts:?}"
    );
    assert!(texts.contains(&"Alice"), "Should detect Alice: {texts:?}");
    assert!(texts.contains(&"Bob"), "Should detect Bob: {texts:?}");
}
#[test]
fn test_german_preposition_location_context() {
    // German "aus" works like English "from" as a LOC context signal.
    let model = HeuristicNER::new();
    let entities = model
        .extract_entities("Er kommt aus Hamburg zum Meeting.", None)
        .unwrap();
    let hamburg_is_loc = entities
        .iter()
        .filter(|e| matches!(e.entity_type, EntityType::Location))
        .any(|e| e.text == "Hamburg");
    assert!(
        hamburg_is_loc,
        "German preposition 'aus' should signal LOC for Hamburg: {entities:?}"
    );
}
#[test]
fn test_trailing_punctuation_stripped() {
    // List and sentence punctuation must be trimmed off extracted spans.
    let model = HeuristicNER::new();
    let found = model
        .extract_entities("She met Google, Microsoft, and Tesla.", None)
        .unwrap();
    for entity in &found {
        for (trailing, label) in [(',', "comma"), ('.', "period")] {
            assert!(
                !entity.text.ends_with(trailing),
                "Entity '{}' should not end with {}",
                entity.text,
                label
            );
        }
    }
}
#[test]
fn test_span_offsets_with_multiple_spaces() {
    // The fixture deliberately contains runs of spaces so the char-offset
    // bookkeeping is exercised against irregular whitespace. (The previous
    // fixture used only single spaces, so this test never covered the
    // scenario its name promises.)
    let ner = HeuristicNER::new();
    let text = "Meeting with  Barack Obama in  Washington.";
    let entities = ner.extract_entities(text, None).unwrap();
    for entity in &entities {
        let extracted: String = text
            .chars()
            .skip(entity.start())
            .take(entity.end() - entity.start())
            .collect();
        assert!(entity.start() < entity.end(), "start < end");
        assert!(
            entity.end() <= text.chars().count(),
            "end ({}) within text len ({})",
            entity.end(),
            text.chars().count()
        );
        // The recovered slice must begin and end on the entity's own words.
        let first_word = entity.text.split_whitespace().next().unwrap();
        let last_word = entity.text.split_whitespace().last().unwrap();
        assert!(
            extracted.starts_with(first_word),
            "Span '{}' should start with '{}' (entity: '{}')",
            extracted,
            first_word,
            entity.text
        );
        assert!(
            extracted.ends_with(last_word),
            "Span '{}' should end with '{}' (entity: '{}')",
            extracted,
            last_word,
            entity.text
        );
    }
}
#[test]
fn test_long_names_not_truncated() {
    // The extracted span must keep the full given name, not just the part
    // the `find` predicate matched. The previous second assertion repeated
    // the find predicate (`contains("Charpentier")`) and could never fail;
    // checking for "Emmanuelle" actually verifies no truncation occurred.
    let ner = HeuristicNER::new();
    let text = "Dr. Emmanuelle Charpentier won the prize.";
    let entities = ner.extract_entities(text, None).unwrap();
    let charpentier = entities.iter().find(|e| e.text.contains("Charpentier"));
    assert!(
        charpentier.is_some(),
        "Should find Charpentier: {:?}",
        entities
    );
    assert!(
        charpentier.unwrap().text.contains("Emmanuelle"),
        "Name should not be truncated: '{}'",
        charpentier.unwrap().text
    );
}
#[test]
fn test_unicode_name_offsets_correct() {
    // Accented Latin names must round-trip through char-based offsets.
    let model = HeuristicNER::new();
    let text = "François Müller presented the results.";
    for entity in &model.extract_entities(text, None).unwrap() {
        let span: String = text
            .chars()
            .skip(entity.start())
            .take(entity.end() - entity.start())
            .collect();
        assert_eq!(
            span, entity.text,
            "Unicode char offsets must match entity text"
        );
    }
}
#[test]
fn test_leading_punct_char_count_not_bytes() {
    // Trimming a leading quote must adjust offsets in chars, not bytes.
    let ner = HeuristicNER::new();
    let text = "She said, \"Alice was there.\"";
    for entity in &ner.extract_entities(text, None).unwrap() {
        assert!(
            !entity.text.starts_with('"'),
            "Entity '{}' should not start with quote",
            entity.text
        );
        let span: String = text
            .chars()
            .skip(entity.start())
            .take(entity.end() - entity.start())
            .collect();
        assert_eq!(
            span, entity.text,
            "Offsets should match after leading punct trim"
        );
    }
}
#[test]
fn test_day_names_not_entities() {
    // Weekday names are capitalized temporal nouns, not entities — even
    // when they open the sentence.
    let ner = HeuristicNER::new();
    for day in &[
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
    ] {
        let text = format!("{} was a busy day at the office.", day);
        let found = ner.extract_entities(&text, None).unwrap();
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            !texts.contains(day),
            "'{}' should not be extracted as entity in: '{}' (got: {:?})",
            day,
            text,
            texts
        );
    }
}
#[test]
fn test_month_names_not_entities() {
    // Month names are capitalized temporal nouns, not entities.
    let ner = HeuristicNER::new();
    for month in &[
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ] {
        let text = format!("{} earnings exceeded expectations.", month);
        let found = ner.extract_entities(&text, None).unwrap();
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            !texts.contains(month),
            "'{}' should not be extracted as entity (got: {:?})",
            month,
            texts
        );
    }
}
#[test]
fn test_month_after_preposition_not_loc() {
    // "in March" looks like a LOC preposition pattern; the month list must
    // override it.
    let ner = HeuristicNER::new();
    let entities = ner
        .extract_entities("Sales peaked in March and declined in December.", None)
        .unwrap();
    let texts: Vec<&str> = entities.iter().map(|e| e.text.as_str()).collect();
    let leaked = texts.iter().any(|t| *t == "March" || *t == "December");
    assert!(
        !leaked,
        "Month names should not be LOC even after 'in': {:?}",
        texts
    );
}
#[test]
fn test_common_acronyms_not_entities() {
    // Everyday tech acronyms are on the skip list and never emitted.
    let ner = HeuristicNER::new();
    for acr in &["LCD", "LED", "USB", "DNA", "RNA", "CPU", "GPU", "HTML", "PDF"] {
        let text = format!("The {} technology was revolutionary.", acr);
        let found = ner.extract_entities(&text, None).unwrap();
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            !texts.contains(acr),
            "'{}' should be filtered as common acronym, got: {:?}",
            acr,
            texts
        );
    }
}
#[test]
fn test_currency_codes_not_entities() {
    // ISO currency codes are skip-listed acronyms.
    let ner = HeuristicNER::new();
    for code in &["EUR", "GBP", "USD", "JPY", "CHF"] {
        let text = format!("The {} exchange rate dropped.", code);
        let found = ner.extract_entities(&text, None).unwrap();
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            !texts.contains(code),
            "'{}' should be filtered as currency code acronym, got: {:?}",
            code,
            texts
        );
    }
}
#[test]
fn test_real_acronyms_still_detected() {
    // Genuine organization acronyms must survive the common-acronym filter.
    let ner = HeuristicNER::new();
    let found = ner
        .extract_entities("She joined DARPA and later CERN.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    assert!(texts.contains(&"DARPA"), "DARPA should still be detected");
    assert!(texts.contains(&"CERN"), "CERN should still be detected");
}
#[test]
fn test_hyphenated_acronym_compounds_not_entities() {
    // A skip-listed acronym prefix disqualifies the whole hyphenated token.
    let ner = HeuristicNER::new();
    for compound in &[
        "DNA-based",
        "LCD-equipped",
        "USB-powered",
        "GPU-accelerated",
    ] {
        let text = format!("The {} system performed well.", compound);
        let found = ner.extract_entities(&text, None).unwrap();
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            !texts.contains(compound),
            "'{}' should be filtered (acronym prefix), got: {:?}",
            compound,
            texts
        );
    }
}
#[test]
fn test_common_acronym_in_two_word_span_no_acronym_signal() {
    // A filtered common acronym (USB) inside a multi-word span must not
    // fire the acronym_in_span rule.
    let ner = HeuristicNER::new();
    let entities = ner
        .extract_entities("She bought an Advanced USB yesterday.", None)
        .unwrap();
    if let Some(span) = entities.iter().find(|e| e.text.contains("USB")) {
        let pattern = span.provenance.as_ref().and_then(|p| p.pattern.as_ref());
        assert!(
            pattern.is_none_or(|p| p.as_ref() != "acronym_in_span"),
            "Common acronym USB should not trigger acronym_in_span rule: {:?}",
            span
        );
    }
}
#[test]
fn test_offset_validity_comprehensive() {
    // Offset invariants (start < end <= char count) across varied fixtures:
    // titles, currencies, acronyms, accented text, quotes, and parentheses.
    let ner = HeuristicNER::new();
    let fixtures = [
        "Barack Obama visited Berlin yesterday.",
        "Dr. Emmanuelle Charpentier and Dr. Jennifer Doudna won the Nobel Prize.",
        "Nintendo reported EUR 1.2 million in revenue on Thursday.",
        "The LCD screens use LED backlighting with USB-C connectors.",
        "François Müller from München met José García in São Paulo.",
        "Google, Microsoft, and Tesla announced partnerships.",
        "She said, \"Alice was there.\"",
        "Bank of America reported (Q3) earnings for Apple Inc.",
    ];
    for text in &fixtures {
        let char_count = text.chars().count();
        for entity in &ner.extract_entities(text, None).unwrap() {
            assert!(
                entity.start() < entity.end(),
                "start ({}) < end ({}) for '{}' in '{}'",
                entity.start(),
                entity.end(),
                entity.text,
                text
            );
            assert!(
                entity.end() <= char_count,
                "end ({}) <= text len ({}) for '{}' in '{}'",
                entity.end(),
                char_count,
                entity.text,
                text
            );
        }
    }
}
#[test]
fn test_title_prefixed_name_is_person() {
    // A job title followed by a capitalized name should read as one PER span.
    let ner = HeuristicNER::new();
    let fixtures = [
        (
            "CEO Shuntaro Furukawa announced the partnership.",
            "CEO Shuntaro Furukawa",
        ),
        (
            "President Barack Obama signed the bill.",
            "President Barack Obama",
        ),
        ("Chairman Li Wei addressed shareholders.", "Chairman Li Wei"),
    ];
    for (text, expected_fragment) in &fixtures {
        let entities = ner.extract_entities(text, None).unwrap();
        // Accept either direction of containment: the span may include or
        // be included in the expected fragment.
        let hit = entities
            .iter()
            .find(|e| e.text.contains(expected_fragment) || expected_fragment.contains(&*e.text));
        let Some(entity) = hit else {
            panic!(
                "Should detect '{}' in '{}', got: {:?}",
                expected_fragment,
                text,
                entities.iter().map(|e| &e.text).collect::<Vec<_>>()
            );
        };
        assert!(
            matches!(entity.entity_type, EntityType::Person),
            "'{}' should be PER, got {:?}",
            entity.text,
            entity.entity_type
        );
    }
}
#[test]
fn test_of_pattern_still_org() {
    // The "X of Y" institutional pattern should remain typed as ORG.
    let ner = HeuristicNER::new();
    let entities = ner
        .extract_entities("Bank of America reported earnings.", None)
        .unwrap();
    let boa = entities
        .iter()
        .find(|e| e.text.contains("Bank of America"))
        .expect("Should detect Bank of America");
    assert!(
        matches!(boa.entity_type, EntityType::Organization),
        "Bank of America should be ORG"
    );
}
#[test]
fn test_standalone_prefix_skipped() {
    // The title "Dr." alone is not an entity; the name following it is.
    let ner = HeuristicNER::new();
    let found = ner
        .extract_entities("Dr. Jennifer Doudna won the Nobel Prize.", None)
        .unwrap();
    let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
    let bare_prefix = texts.iter().any(|t| *t == "Dr" || *t == "Dr.");
    assert!(
        !bare_prefix,
        "Standalone 'Dr' should be skipped, got: {:?}",
        texts
    );
    let name_found = texts
        .iter()
        .any(|t| t.contains("Jennifer") || t.contains("Doudna"));
    assert!(name_found, "Should detect the full name, got: {:?}", texts);
}
#[test]
fn test_standalone_person_prefixes_skipped() {
    // Bare titles with no following name must never become entities.
    let ner = HeuristicNER::new();
    for prefix in &["Dr", "Mr", "Mrs", "Prof"] {
        let text = format!("{} went home.", prefix);
        let entities = ner.extract_entities(&text, None).unwrap();
        let leaked = entities
            .iter()
            .any(|e| e.text.trim_end_matches('.') == *prefix);
        assert!(
            !leaked,
            "Standalone '{}' should be skipped, got: {:?}",
            prefix,
            entities.iter().map(|e| &e.text).collect::<Vec<_>>()
        );
    }
}
#[test]
fn fiscal_quarter_not_tagged_as_entity() {
    // Bare fiscal-quarter tokens (Q1..Q4) are not entities.
    let ner = HeuristicNER::new();
    for quarter in &["Q1", "Q2", "Q3", "Q4"] {
        let text = format!("{} revenue increased by 10%.", quarter);
        let entities = ner.extract_entities(&text, None).unwrap();
        let tagged = entities.iter().any(|e| e.text == *quarter);
        assert!(
            !tagged,
            "'{}' should not be tagged as an entity, got: {:?}",
            quarter,
            entities.iter().map(|e| &e.text).collect::<Vec<_>>()
        );
    }
}
#[test]
fn fiscal_quarter_multi_word_not_entity() {
    // Multi-token fiscal spans like "Q3 FY2025" must not be tagged.
    // Use the glob-imported `HeuristicNER` (via `use super::*;`) for
    // consistency with every other test in this module instead of the
    // redundant `super::` path.
    let ner = HeuristicNER::new();
    for pattern in &["Q3 FY2025", "Q1 2024", "Q4 FY2023", "Q2 H1"] {
        let text = format!("The company reported {} earnings grew.", pattern);
        let entities = ner.extract_entities(&text, None).unwrap();
        let has_fiscal = entities
            .iter()
            .any(|e| e.text.starts_with('Q') && e.text.contains(pattern));
        assert!(
            !has_fiscal,
            "'{}' should not be tagged as an entity in: {:?}",
            pattern,
            entities.iter().map(|e| &e.text).collect::<Vec<_>>()
        );
    }
}
#[test]
fn economic_acronyms_not_entities() {
    // Economic-indicator acronyms are skip-listed and never emitted.
    // Use the glob-imported `HeuristicNER` (via `use super::*;`) for
    // consistency with the rest of the module instead of the redundant
    // `super::` path.
    let ner = HeuristicNER::new();
    for acronym in &["GDP", "GNP", "CPI", "ROI", "EBITDA", "IPO", "ETF"] {
        let text = format!("The {} grew by 3% this quarter.", acronym);
        let entities = ner.extract_entities(&text, None).unwrap();
        let has_acronym = entities.iter().any(|e| e.text == *acronym);
        assert!(
            !has_acronym,
            "'{}' should not be tagged as an entity, got: {:?}",
            acronym,
            entities.iter().map(|e| &e.text).collect::<Vec<_>>()
        );
    }
}
#[test]
fn org_suffix_services_technologies() {
    // Suffix words like "Services"/"Technologies"/"Systems" should keep
    // the whole preceding span together as an organization. Use the
    // glob-imported `HeuristicNER` (via `use super::*;`) for consistency
    // with the rest of the module instead of the redundant `super::` path.
    let ner = HeuristicNER::new();
    for name in &[
        "Amazon Web Services",
        "Palantir Technologies",
        "General Dynamics Systems",
    ] {
        let text = format!("{} announced a new product.", name);
        let entities = ner.extract_entities(&text, None).unwrap();
        // An entity containing the suffix word proves it was not dropped.
        let found = entities
            .iter()
            .any(|e| e.text.contains(name.split_whitespace().last().unwrap()));
        assert!(
            found,
            "Should detect org suffix in '{}', got: {:?}",
            name,
            entities
                .iter()
                .map(|e| (&e.text, &e.entity_type))
                .collect::<Vec<_>>()
        );
    }
}