1use crate::{Entity, EntityType, ExtractionMethod, Model, Provenance, Result};
13
/// Lightweight rule-based named-entity recognizer.
///
/// Relies on capitalization heuristics, small gazetteers, and context rules;
/// it needs no model weights or network access.
#[derive(Debug, Clone)]
pub struct HeuristicNER {
    // Minimum confidence a candidate must reach to be emitted as an entity.
    threshold: f64,
}
23
24impl Default for HeuristicNER {
25 fn default() -> Self {
26 Self { threshold: 0.35 }
27 }
28}
29
30impl HeuristicNER {
31 #[must_use]
33 pub fn new() -> Self {
34 Self::default()
35 }
36
37 #[must_use]
39 pub fn with_threshold(threshold: f64) -> Self {
40 Self { threshold }
41 }
42}
43
/// Lowercased organization markers: a span ending in one of these is
/// classified as an organization.
const ORG_SUFFIX: &[&str] = &[
    // Anglophone legal forms (with and without trailing dot).
    "inc.", "inc", "corp.", "corp", "ltd.", "ltd", "llc", "co.", "plc",
    // Generic institution words.
    "foundation", "institute", "university", "college", "bank", "group", "agency",
    // Continental-European legal forms.
    "gmbh", "ag", "kg", "sa", "s.a.", "s.l.", "s.r.l.", "spa", "nv", "bv", "pty", "ab",
    // Spelled-out forms.
    "limited", "corporation", "incorporated", "company", "holding", "holdings",
];
/// Lowercased honorifics (with and without trailing dot) that precede a
/// person's name, e.g. "Dr. Smith".
const PERSON_PREFIX: &[&str] = &[
    "mr.", "mr", "ms.", "ms", "mrs.", "mrs", "dr.", "dr", "prof.", "prof",
];
/// Prepositions (English, German, Spanish, French) that commonly introduce a
/// place name; a capitalized span right after one of these leans LOCATION.
const LOC_PREPOSITION: &[&str] = &[
    // English.
    "in", "from", "at", "to", "near",
    // German.
    "aus", "nach", "bei", "von",
    // Spanish / French.
    "en", "de", "à", "dans", "por", "sur",
];
/// Job titles that are often capitalized ("CEO", "Director") but are not
/// entities on their own; single-word spans matching these are skipped.
///
/// Fix: dropped the spurious `#[allow(dead_code)]` — the constant is used
/// unconditionally by `classify_minimal`.
const SKIP_WORDS: &[&str] = &["ceo", "cto", "cfo", "vp", "president", "chairman", "director"];
101
/// Lowercased function words and everyday sentence openers. A capitalized
/// word matching this list is assumed to be capitalized only because it
/// starts a sentence, so no entity span is started there.
const COMMON_SENTENCE_STARTERS: &[&str] = &[
    // Determiners and pronouns.
    "the", "a", "an", "this", "that", "these", "those", "it", "he", "she", "we", "they",
    // Prepositions.
    "in", "on", "at", "to", "for", "from", "by", "with",
    // Conjunctions.
    "and", "but", "or", "so", "yet", "if", "because",
    // Imperative openers.
    "contact", "call", "email", "visit", "please", "see", "note",
    // Temporal adverbs.
    "today", "yesterday", "tomorrow", "now", "then",
    // Interrogatives.
    "what", "where", "when", "who", "why", "how",
    // Auxiliary verbs.
    "is", "are", "was", "were", "be", "been", "have", "has", "had",
];
159
/// Gazetteer of well-known organizations (lowercased for Latin-script
/// entries). CJK entries are matched as raw substrings in the CJK pass of
/// `extract_entities`; Latin entries are matched per word in
/// `classify_minimal`.
///
/// Fix: dropped the spurious `#[allow(dead_code)]` — the constant is used
/// unconditionally by both `extract_entities` and `classify_minimal`.
const KNOWN_ORGS: &[&str] = &[
    // Tech and consumer companies.
    "google", "apple", "microsoft", "amazon", "facebook", "meta", "tesla", "twitter",
    "ibm", "intel", "nvidia", "oracle", "cisco", "samsung", "sony", "toyota", "honda",
    "bmw", "mercedes", "volkswagen",
    // Agencies and institutions.
    "nasa", "fbi", "cia", "nsa", "nato", "un", "eu",
    // Media outlets.
    "bbc", "cnn", "nbc", "cbs", "abc", "fox", "nyt", "wsj", "reuters", "bloomberg",
    // Internet / finance brands.
    "spotify", "netflix", "uber", "airbnb", "paypal", "visa", "mastercard", "amex",
    // Japanese (katakana/kanji) names.
    "ソニー", "トヨタ", "ホンダ", "任天堂", "サムスン", "ファーウェイ", "アリババ", "テンセント",
    // Simplified-Chinese names.
    "华为", "阿里巴巴", "腾讯", "百度", "小米",
];
224
/// Gazetteer of well-known place names (lowercased for Latin-script
/// entries). CJK entries are matched as raw substrings in the CJK pass of
/// `extract_entities`; Latin entries are matched in `classify_minimal`.
///
/// Fix: dropped the spurious `#[allow(dead_code)]` — the constant is used
/// unconditionally by both `extract_entities` and `classify_minimal`.
const KNOWN_LOCS: &[&str] = &[
    // Cities.
    "paris", "london", "tokyo", "berlin", "rome", "madrid", "moscow", "beijing",
    "shanghai", "dubai", "singapore", "sydney", "toronto", "chicago", "boston",
    // US states and multi-word names.
    "california", "texas", "florida", "new york", "washington",
    // Continents / regions.
    "europe", "asia", "africa", "america", "australia",
    // Countries.
    "china", "india", "japan", "germany", "france", "italy", "spain", "brazil",
    "mexico", "russia", "korea", "canada", "uk", "usa",
    // CJK place names.
    "東京", "大阪", "京都", "北京", "上海", "香港", "ソウル", "台北",
    "中国", "日本", "韓国", "アメリカ", "イギリス", "フランス", "ドイツ",
];
283
/// Gazetteer of common Western given names (lowercased); a span whose first
/// word matches is classified as a person in `classify_minimal`.
///
/// Fix: dropped the spurious `#[allow(dead_code)]` — the constant is used
/// unconditionally by `classify_minimal`.
const KNOWN_PERSONS: &[&str] = &[
    "john", "jane", "mary", "james", "robert", "michael", "william", "david", "richard", "joseph",
    "thomas", "charles", "barack", "donald", "joe", "george", "bill", "vladimir", "emmanuel",
    "boris", "narendra", "justin", "elon", "jeff", "mark", "steve", "tim", "satya", "sundar",
    "albert", "isaac", "stephen", "neil", "peter", "paul", "matthew", "andrew", "philip", "simon",
    "marie", "angela", "hillary", "nancy", "kamala", "michelle", "melania", "jill", "theresa",
    "ursula",
];
293
impl Model for HeuristicNER {
    /// Extracts PERSON / ORGANIZATION / LOCATION entities from `text` using
    /// capitalization heuristics, gazetteers, and context rules.
    ///
    /// Returned offsets are character (not byte) indices. The `_language`
    /// hint is ignored; CJK handling is triggered by the presence of CJK
    /// codepoints in the text itself.
    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
        if text.is_empty() {
            return Ok(vec![]);
        }

        let mut entities: Vec<Entity> = Vec::new();

        // CJK detection: Han ideographs, hiragana, or katakana anywhere in the text.
        let has_cjk = text.chars().any(|c| {
            ('\u{4e00}'..='\u{9fff}').contains(&c)
                || ('\u{3040}'..='\u{309f}').contains(&c)
                || ('\u{30a0}'..='\u{30ff}').contains(&c)
        });

        if has_cjk {
            // Converts byte offsets from substring matching into char offsets.
            let converter = crate::offset::SpanConverter::new(text);

            for &org in KNOWN_ORGS {
                // Only gazetteer entries containing codepoints >= U+3040 (kana/
                // ideographs) are matched as raw substrings here; Latin entries
                // go through the word-based pass below.
                if org.chars().any(|c| c >= '\u{3040}') {
                    let org_char_count = if org.is_ascii() {
                        org.len()
                    } else {
                        org.chars().count()
                    };

                    for (start_byte, _) in text.match_indices(org) {
                        let char_start = converter.byte_to_char(start_byte);
                        let char_end = char_start + org_char_count;
                        // De-duplicate identical spans.
                        if !entities
                            .iter()
                            .any(|e| e.start == char_start && e.end == char_end)
                        {
                            entities.push(Entity::new(
                                org.to_string(),
                                EntityType::Organization,
                                char_start,
                                char_end,
                                0.9,
                            ));
                        }
                    }
                }
            }
            // Same substring matching for CJK place names.
            for &loc in KNOWN_LOCS {
                if loc.chars().any(|c| c >= '\u{3040}') {
                    let loc_char_count = if loc.is_ascii() {
                        loc.len()
                    } else {
                        loc.chars().count()
                    };

                    for (start_byte, _) in text.match_indices(loc) {
                        let char_start = converter.byte_to_char(start_byte);
                        let char_end = char_start + loc_char_count;
                        if !entities
                            .iter()
                            .any(|e| e.start == char_start && e.end == char_end)
                        {
                            entities.push(Entity::new(
                                loc.to_string(),
                                EntityType::Location,
                                char_start,
                                char_end,
                                0.9,
                            ));
                        }
                    }
                }
            }
        }

        // Whitespace tokenizer tracking (word, start_char, end_char) triples.
        let mut words_with_pos: Vec<(&str, usize, usize)> = Vec::new();

        let mut in_word = false;
        let mut word_start_byte = 0;
        let mut word_start_char = 0;
        let mut char_pos = 0;

        for (i, c) in text.char_indices() {
            if c.is_whitespace() {
                if in_word {
                    let word = &text[word_start_byte..i];
                    words_with_pos.push((word, word_start_char, char_pos));
                    in_word = false;
                }
            } else if !in_word {
                in_word = true;
                word_start_byte = i;
                word_start_char = char_pos;
            }
            char_pos += 1;
        }
        // Flush the trailing word when the text does not end in whitespace.
        if in_word {
            let word = &text[word_start_byte..];
            words_with_pos.push((word, word_start_char, char_pos));
        }

        let words: Vec<&str> = words_with_pos.iter().map(|(w, _, _)| *w).collect();

        let mut i = 0;
        while i < words.len() {
            let word = words[i];

            // Strip leading punctuation (quotes, parens) before the case check.
            let clean_leading = word.trim_start_matches(|c: char| !c.is_alphanumeric());
            if clean_leading.is_empty() {
                i += 1;
                continue;
            }

            // Candidate spans must start with an uppercase letter.
            if !clean_leading
                .chars()
                .next()
                .map(|c| c.is_uppercase())
                .unwrap_or(false)
            {
                i += 1;
                continue;
            }

            let start_idx = i;

            // Skip capitalized function words ("The", "It", ...) that are only
            // capitalized because they start a sentence.
            let first_word_lower = word.to_lowercase();
            let first_word_clean = first_word_lower.trim_matches(|c: char| !c.is_alphanumeric());
            if COMMON_SENTENCE_STARTERS.contains(&first_word_clean) {
                i += 1;
                continue;
            }

            // Grow the span while words stay capitalized, allowing an "of"/"the"
            // connector when the word after it is capitalized again
            // (e.g. "Bank of America").
            while i < words.len() {
                let w = words[i];
                let w_clean = w.trim_start_matches(|c: char| !c.is_alphanumeric());

                let ends_with_closing = w.ends_with([')', ']', '}']);
                let ends_with_punct = w.ends_with(['.', '!', '?']);

                let first_char_upper = w_clean
                    .chars()
                    .next()
                    .map(|c| c.is_uppercase())
                    .unwrap_or(false);

                let is_connector = matches!(w.to_lowercase().as_str(), "of" | "the");

                let next_word_ok = if i + 1 < words.len() {
                    let next = words[i + 1];
                    let next_clean = next.trim_start_matches(|c: char| !c.is_alphanumeric());
                    let next_upper = next_clean
                        .chars()
                        .next()
                        .map(|c| c.is_uppercase())
                        .unwrap_or(false);

                    // An org suffix ("Inc.") may legitimately follow a word that
                    // ends with punctuation, as in "Apple Inc.".
                    let is_suffix = ORG_SUFFIX.contains(&&*next_clean.to_lowercase());

                    if (ends_with_closing || ends_with_punct) && !is_suffix {
                        false
                    } else {
                        next_upper
                    }
                } else {
                    false
                };

                if first_char_upper || (is_connector && next_word_ok) {
                    i += 1;
                    if ends_with_closing || ends_with_punct {
                        // Stop at sentence-ending punctuation unless the next
                        // word is an organization suffix.
                        let is_suffix_next = if let Some(next_w) = words.get(i) {
                            let clean = next_w.to_lowercase();
                            let clean_ref = clean.trim_matches(|c: char| !c.is_alphanumeric());
                            ORG_SUFFIX.contains(&clean_ref)
                        } else {
                            false
                        };

                        if !is_suffix_next {
                            break;
                        }
                    }
                } else {
                    break;
                }
            }
            let end_idx = i;

            // NOTE(review): defensive guard. If it were ever taken, `i` would
            // not have advanced; it appears unreachable because the first word
            // is already known to start with an uppercase letter, so the loop
            // above increments `i` at least once — confirm.
            if start_idx == end_idx {
                continue;
            }

            let span_words = &words[start_idx..end_idx];
            let mut entity_text = span_words.join(" ");

            // If the token before the span is an honorific ("Dr.", "Prof."),
            // fold it into the entity text.
            let prev_word = if start_idx > 0 {
                Some(
                    words[start_idx - 1]
                        .to_lowercase()
                        .trim_end_matches('.')
                        .to_string(),
                )
            } else {
                None
            };
            let should_include_prefix = prev_word
                .as_ref()
                .map(|p| PERSON_PREFIX.contains(&p.as_str()))
                .unwrap_or(false);

            if should_include_prefix {
                let prefix_word = &words[start_idx - 1];
                entity_text = format!("{} {}", prefix_word, entity_text);
                let prefix_char_start = words_with_pos[start_idx - 1].1;
                let char_start = prefix_char_start;
                // NOTE(review): this assumes words are separated by exactly one
                // whitespace char (join(" ") vs. original spacing); runs of
                // whitespace would skew char_end — confirm with callers.
                let char_end = char_start + entity_text.chars().count();

                let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
                let (entity_type, confidence, reason) =
                    classify_minimal(&clean_span_words, &words, start_idx - 1);

                if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
                    entities.push(Entity::with_provenance(
                        entity_text,
                        entity_type,
                        char_start,
                        char_end,
                        confidence,
                        Provenance {
                            source: "heuristic".into(),
                            method: ExtractionMethod::Heuristic,
                            pattern: Some(reason.into()),
                            raw_confidence: Some(confidence),
                            model_version: None,
                            timestamp: None,
                        },
                    ));
                }
                continue;
            }

            // Strip leading punctuation from the span text.
            // NOTE(review): `leading_punct_len` is a BYTE count that is later
            // added to a CHAR offset; non-ASCII leading punctuation (e.g. «)
            // would skew the reported offsets — confirm whether inputs can
            // contain such punctuation.
            let leading_punct_len = entity_text.len()
                - entity_text
                    .trim_start_matches(|c: char| !c.is_alphanumeric())
                    .len();
            if leading_punct_len > 0 {
                entity_text = entity_text[leading_punct_len..].to_string();
            }

            // Strip trailing punctuation.
            while entity_text.ends_with(|c: char| !c.is_alphanumeric()) {
                entity_text.pop();
            }

            if entity_text.is_empty() {
                continue;
            }

            let char_start = words_with_pos[start_idx].1 + leading_punct_len;
            let char_end = char_start
                + if entity_text.is_ascii() {
                    entity_text.len()
                } else {
                    entity_text.chars().count()
                };

            let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
            let (entity_type, confidence, reason) =
                classify_minimal(&clean_span_words, &words, start_idx);

            // Emit only confident, typed results; "Other" is the skip marker.
            if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
                entities.push(Entity::with_provenance(
                    entity_text,
                    entity_type,
                    char_start,
                    char_end,
                    confidence,
                    Provenance {
                        source: "heuristic".into(),
                        method: ExtractionMethod::Heuristic,
                        pattern: Some(reason.into()),
                        raw_confidence: Some(confidence),
                        model_version: None,
                        timestamp: None,
                    },
                ));
            }
        }

        Ok(entities)
    }

    /// Entity types this model can emit.
    fn supported_types(&self) -> Vec<EntityType> {
        vec![
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
        ]
    }

    /// Always available: no external model files or services are required.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "heuristic"
    }

    fn description(&self) -> &'static str {
        "Heuristic NER optimized for low complexity"
    }
}
648
/// Returns `true` when `w` (after trimming surrounding punctuation) contains
/// at least two alphabetic characters, all of them uppercase — the shape of
/// an acronym such as "IBM" or "N2K". Caseless scripts (CJK, Arabic) never
/// qualify because their letters are not uppercase.
fn is_acronym_word(w: &str) -> bool {
    let core = w.trim_matches(|c: char| !c.is_alphanumeric());
    let mut letter_count = 0usize;
    for ch in core.chars() {
        if ch.is_alphabetic() {
            // One lowercase letter disqualifies the word immediately.
            if !ch.is_uppercase() {
                return false;
            }
            letter_count += 1;
        }
    }
    letter_count >= 2
}
662
/// Assigns an entity type, confidence, and rule label to a candidate span.
///
/// Rules are checked in precedence order (each early return wins over the
/// ones below it): skip lists → org suffix → gazetteers → honorific context
/// → acronym shape → locative preposition → span-length fallbacks.
///
/// `span` holds the (possibly punctuation-carrying) span words; `all_words`
/// is the full token stream and `start_idx` the span's first index in it,
/// so the preceding word can be inspected for context.
fn classify_minimal(
    span: &[&str],
    all_words: &[&str],
    start_idx: usize,
) -> (EntityType, f64, &'static str) {
    let last_word = span.last().map(|s| s.to_lowercase()).unwrap_or_default();
    let first_word = span.first().map(|s| s.to_lowercase()).unwrap_or_default();
    let span_lower = span
        .iter()
        .map(|s| s.to_lowercase())
        .collect::<Vec<_>>()
        .join(" ");

    // Word immediately before the span, if any, lowercased for matching.
    let prev_word = if start_idx > 0 {
        Some(all_words[start_idx - 1].to_lowercase())
    } else {
        None
    };

    // Single-word pronouns/articles are never entities.
    let skip_pronouns = [
        "the", "a", "an", "he", "she", "it", "they", "we", "i", "you",
    ];
    if span.len() == 1 && skip_pronouns.contains(&first_word.as_str()) {
        return (EntityType::Other("skip".into()), 0.0, "skip_pronoun");
    }
    // Lone job titles ("CEO", "Director") are skipped as well.
    // (`first_word` is already lowercase, so the extra to_lowercase() is a no-op.)
    let first_clean_lc = first_word
        .trim_end_matches(|c: char| !c.is_alphanumeric())
        .to_lowercase();
    if span.len() == 1 && SKIP_WORDS.contains(&first_clean_lc.as_str()) {
        return (EntityType::Other("skip".into()), 0.0, "skip_word");
    }

    // A legal-form suffix ("Inc.", "GmbH") on the last word is the strongest
    // organization signal.
    let last_clean: &str = last_word.trim_end_matches(|c: char| !c.is_alphanumeric());
    if ORG_SUFFIX.contains(&last_clean) {
        return (EntityType::Organization, 0.85, "org_suffix");
    }

    // Gazetteer lookups: match on the first word or the whole lowercased span.
    let first_clean_text = first_word.trim_end_matches(|c: char| !c.is_alphanumeric());
    if KNOWN_ORGS.contains(&first_clean_text) || KNOWN_ORGS.contains(&span_lower.as_str()) {
        return (EntityType::Organization, 0.80, "known_org");
    }

    if KNOWN_LOCS.contains(&first_clean_text) || KNOWN_LOCS.contains(&span_lower.as_str()) {
        return (EntityType::Location, 0.80, "known_location");
    }

    if KNOWN_PERSONS.contains(&first_clean_text) {
        return (EntityType::Person, 0.75, "common_name");
    }

    // An honorific directly before the span marks a person.
    if let Some(prev) = &prev_word {
        let prev_clean: &str = prev.trim_end_matches('.');
        if PERSON_PREFIX.contains(&prev_clean) {
            return (EntityType::Person, 0.80, "person_prefix_context");
        }
    }

    // Honorific as the span's own first word ("Dr. Smith" folded in).
    let first_clean: &str = first_word.trim_end_matches('.');
    if PERSON_PREFIX.contains(&first_clean) && span.len() >= 2 {
        return (EntityType::Person, 0.75, "person_prefix_span");
    }

    // A genuine acronym inside a multi-word span ("Xerox PARC") suggests an
    // organization — unless the "acronym" is just a job title like "CEO".
    if span.len() >= 2 {
        let has_real_acronym = span.iter().any(|w| {
            is_acronym_word(w) && {
                let lc = w.to_lowercase();
                !SKIP_WORDS.contains(&lc.trim_matches(|c: char| !c.is_alphanumeric()))
            }
        });
        if has_real_acronym {
            return (EntityType::Organization, 0.70, "acronym_in_span");
        }
    }

    // A locative preposition before the span ("in Paris") suggests a place.
    if let Some(prev) = &prev_word {
        if LOC_PREPOSITION.contains(&prev.as_str()) {
            return (EntityType::Location, 0.70, "loc_context");
        }
    }

    // Shape-based fallbacks by span length.
    if span.len() == 2 {
        // "New …", "South …" etc. lean toward place names.
        let place_indicators = ["united", "new", "south", "north", "west", "east", "great"];
        if place_indicators.contains(&first_word.as_str()) {
            return (EntityType::Location, 0.65, "loc_indicator");
        }
        // Two capitalized words default to a person name.
        return (EntityType::Person, 0.60, "two_word_name");
    }

    if span.len() >= 3 {
        // "X of Y" patterns ("Bank of America") lean organization.
        if span.len() >= 2 && span[1].to_lowercase() == "of" {
            return (EntityType::Organization, 0.65, "org_of_pattern");
        }
        return (EntityType::Organization, 0.50, "long_span_org");
    }

    if span.len() == 1 {
        let word = span[0].trim_matches(|c: char| !c.is_alphanumeric());
        // A single letter carries no signal.
        if word.len() == 1 {
            return (EntityType::Other("skip".into()), 0.0, "single_letter");
        }
        // A lone acronym ("DARPA") defaults to organization.
        if is_acronym_word(word) {
            let lc = word.to_lowercase();
            if !SKIP_WORDS.contains(&lc.as_str()) {
                return (EntityType::Organization, 0.55, "single_acronym");
            }
        }
    }

    // A single capitalized word at the very start of the text: weak guess.
    // (`prev_word.is_none()` is equivalent to `start_idx == 0` here.)
    if start_idx == 0 && prev_word.is_none() {
        return (EntityType::Person, 0.30, "single_start_word");
    }

    // Default: a capitalized word mid-text, weakly assumed to be a person.
    (EntityType::Person, 0.45, "capitalized")
}
796
// Marker impl: advertises that HeuristicNER supports named-entity extraction.
impl crate::NamedEntityCapable for HeuristicNER {}
798
impl crate::BatchCapable for HeuristicNER {
    /// Suggested number of documents per batch (16).
    fn optimal_batch_size(&self) -> Option<usize> {
        Some(16)
    }
}
808
impl crate::StreamingCapable for HeuristicNER {
    /// Recommended input chunk size for streaming extraction.
    // NOTE(review): whether 8192 counts bytes or chars is defined by the
    // `StreamingCapable` trait, which is not visible in this file — confirm.
    fn recommended_chunk_size(&self) -> usize {
        8192
    }
}
818
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the default recognizer over `text` and unwraps the result.
    fn run(text: &str) -> Vec<Entity> {
        HeuristicNER::new().extract_entities(text, None).unwrap()
    }

    /// Counts extracted entities whose type satisfies `pred`.
    fn count_of(entities: &[Entity], pred: fn(&EntityType) -> bool) -> usize {
        entities.iter().filter(|e| pred(&e.entity_type)).count()
    }

    #[test]
    fn test_basic_person_detection() {
        let found = run("Dr. John Smith met with Mary.");
        let names: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            names
                .iter()
                .any(|n| n.contains("John") || n.contains("Smith")),
            "Should detect John Smith: {:?}",
            names
        );
    }

    #[test]
    fn test_organization_suffix_detection() {
        let found = run("Apple Inc. announced new products.");
        let orgs = count_of(&found, |t| matches!(t, EntityType::Organization));
        assert!(orgs > 0, "Should detect Apple Inc. as organization");
    }

    #[test]
    fn test_location_preposition_context() {
        let found = run("She lived in Paris for years.");
        let locs = count_of(&found, |t| matches!(t, EntityType::Location));
        assert!(locs > 0, "Should detect Paris as location");
    }

    #[test]
    fn test_known_organizations() {
        let found = run("Google and Microsoft competed.");
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            texts.iter().any(|t| t.contains("Google")),
            "Should detect Google"
        );
        assert!(
            texts.iter().any(|t| t.contains("Microsoft")),
            "Should detect Microsoft"
        );
    }

    #[test]
    fn test_cjk_organization_detection() {
        let found = run("ソニーが新製品を発表しました。");
        let orgs = count_of(&found, |t| matches!(t, EntityType::Organization));
        assert!(
            orgs > 0,
            "Should detect Sony (ソニー) as organization"
        );
    }

    #[test]
    fn test_cjk_location_detection() {
        let found = run("東京オリンピックが開催された。");
        let locs = count_of(&found, |t| matches!(t, EntityType::Location));
        assert!(locs > 0, "Should detect Tokyo (東京) as location");
    }

    #[test]
    fn test_empty_text() {
        assert!(run("").is_empty());
    }

    #[test]
    fn test_no_entities() {
        let found = run("the quick brown fox jumps over the lazy dog");
        assert!(
            found.is_empty(),
            "Lowercase text should have no entities"
        );
    }

    #[test]
    fn test_threshold_filtering() {
        let permissive = HeuristicNER::with_threshold(0.1);
        let strict = HeuristicNER::with_threshold(0.9);

        let sample = "John works at Google.";
        let many = permissive.extract_entities(sample, None).unwrap();
        let few = strict.extract_entities(sample, None).unwrap();

        // A higher threshold can only remove entities, never add them.
        assert!(many.len() >= few.len());
    }

    #[test]
    fn test_sentence_starter_filtering() {
        let found = run("The dog ran. It was fast.");
        let texts: Vec<&str> = found.iter().map(|e| e.text.as_str()).collect();
        assert!(
            !texts.contains(&"The"),
            "Should filter 'The' as sentence starter"
        );
        assert!(!texts.contains(&"It"), "Should filter 'It' as pronoun");
    }

    #[test]
    fn test_person_prefix_detection() {
        let found = run("Prof. Einstein presented the theory.");
        let persons = count_of(&found, |t| matches!(t, EntityType::Person));
        assert!(
            persons > 0,
            "Should detect Prof. Einstein as person"
        );
    }

    #[test]
    fn test_multi_word_organization() {
        let found = run("Bank of America provides services.");
        let orgs = count_of(&found, |t| matches!(t, EntityType::Organization));
        assert!(orgs > 0, "Should detect 'Bank of America' pattern");
    }

    #[test]
    fn test_location_indicators() {
        let found = run("New Zealand is beautiful.");
        let locs = count_of(&found, |t| matches!(t, EntityType::Location));
        assert!(locs > 0, "Should detect 'New Zealand' as location");
    }

    #[test]
    fn test_model_trait_implementation() {
        let model = HeuristicNER::new();

        assert_eq!(model.name(), "heuristic");
        assert!(model.is_available());
        assert!(!model.supported_types().is_empty());
        assert!(model.description().contains("Heuristic"));
    }

    #[test]
    fn test_entity_offsets_are_valid() {
        let text = "Barack Obama visited Berlin yesterday.";
        let found = run(text);

        let total_chars = text.chars().count();
        for ent in &found {
            assert!(ent.start <= ent.end, "start should be <= end");
            assert!(ent.end <= total_chars, "end should be within text");

            // Offsets are char indices, so slice by chars, not bytes.
            let slice: String = text
                .chars()
                .skip(ent.start)
                .take(ent.end - ent.start)
                .collect();
            assert_eq!(
                slice, ent.text,
                "Extracted text should match entity text"
            );
        }
    }

    #[test]
    fn test_unicode_text_handling() {
        let text = "François Müller from München met José García.";
        for ent in &run(text) {
            let slice: String = text
                .chars()
                .skip(ent.start)
                .take(ent.end - ent.start)
                .collect();
            assert_eq!(slice, ent.text, "Unicode offsets should be correct");
        }
    }

    #[test]
    fn test_provenance_is_set() {
        for ent in &run("Google announced today.") {
            if let Some(ref prov) = ent.provenance {
                assert_eq!(prov.source, "heuristic");
                assert!(matches!(prov.method, ExtractionMethod::Heuristic));
            }
        }
    }

    #[test]
    fn test_is_acronym_word_latin() {
        assert!(is_acronym_word("PARC"));
        assert!(is_acronym_word("IBM"));
        assert!(is_acronym_word("NASA"));
        assert!(is_acronym_word("N2K"));
        assert!(is_acronym_word("DARPA."));
        assert!(is_acronym_word("(NATO)"));
        assert!(!is_acronym_word("Xerox"));
        assert!(!is_acronym_word("Lynn"));
        assert!(!is_acronym_word("A"));
        assert!(!is_acronym_word("42"));
        assert!(!is_acronym_word(""));
    }

    #[test]
    fn test_is_acronym_word_cyrillic() {
        // "НАТО" and "МИД" are uppercase Cyrillic acronyms.
        assert!(is_acronym_word("\u{041D}\u{0410}\u{0422}\u{041E}"));
        assert!(is_acronym_word("\u{041C}\u{0418}\u{0414}"));
        // "Москва" is mixed-case, so not an acronym.
        assert!(!is_acronym_word("\u{041C}\u{043E}\u{0441}\u{043A}\u{0432}\u{0430}"));
    }

    #[test]
    fn test_is_acronym_word_caseless_scripts() {
        // Kanji, katakana, and Arabic have no uppercase letters.
        assert!(!is_acronym_word("\u{6771}\u{4EAC}"));
        assert!(!is_acronym_word("\u{30BD}\u{30CB}\u{30FC}"));
        assert!(!is_acronym_word("\u{062D}\u{0645}\u{0627}\u{0633}"));
    }

    #[test]
    fn test_acronym_in_multi_word_span_signals_org() {
        let entities = run("Lynn Conway worked at IBM and Xerox PARC in California.");
        let xerox_parc = entities.iter().find(|e| e.text == "Xerox PARC");
        assert!(xerox_parc.is_some(), "Should detect 'Xerox PARC': {entities:?}");
        let hit = xerox_parc.unwrap();
        assert!(
            matches!(hit.entity_type, EntityType::Organization),
            "Xerox PARC should be ORG, got {:?}",
            hit.entity_type,
        );
    }

    #[test]
    fn test_acronym_no_regression_on_normal_names() {
        let entities = run("Lynn Conway designed the processor.");
        let lynn = entities.iter().find(|e| e.text == "Lynn Conway");
        assert!(lynn.is_some(), "Should detect 'Lynn Conway': {entities:?}");
        let hit = lynn.unwrap();
        assert!(
            matches!(hit.entity_type, EntityType::Person),
            "Lynn Conway should remain PER, got {:?}",
            hit.entity_type,
        );
    }

    #[test]
    fn test_single_acronym_signals_org() {
        let entities = run("She joined DARPA last year.");
        let darpa = entities.iter().find(|e| e.text == "DARPA");
        assert!(darpa.is_some(), "Should detect 'DARPA': {entities:?}");
        let hit = darpa.unwrap();
        assert!(
            matches!(hit.entity_type, EntityType::Organization),
            "DARPA should be ORG, got {:?}",
            hit.entity_type,
        );
    }

    #[test]
    fn test_known_loc_acronym_still_loc() {
        let entities = run("She moved to USA last year.");
        let usa = entities.iter().find(|e| e.text == "USA");
        assert!(usa.is_some(), "Should detect 'USA': {entities:?}");
        let hit = usa.unwrap();
        assert!(
            matches!(hit.entity_type, EntityType::Location),
            "USA should be LOC (gazetteer wins), got {:?}",
            hit.entity_type,
        );
    }
}