1use crate::{Entity, EntityType, ExtractionMethod, Model, Provenance, Result};
13
/// Rule-based named-entity recognizer with no external model dependency.
///
/// Extraction is driven by capitalization patterns, small keyword lists,
/// and contextual cues (see the `Model` impl below).
#[derive(Debug, Clone)]
pub struct HeuristicNER {
    // Minimum confidence an extracted entity must reach to be emitted.
    threshold: f64,
}
23
24impl Default for HeuristicNER {
25 fn default() -> Self {
26 Self { threshold: 0.35 }
27 }
28}
29
30impl HeuristicNER {
31 #[must_use]
33 pub fn new() -> Self {
34 Self::default()
35 }
36
37 #[must_use]
39 pub fn with_threshold(threshold: f64) -> Self {
40 Self { threshold }
41 }
42}
43
/// Lowercase corporate / legal-form suffixes. A capitalized span ending in
/// one of these is classified as an organization.
const ORG_SUFFIX: &[&str] = &[
    // Anglophone legal forms (with and without the trailing dot).
    "inc.", "inc", "corp.", "corp", "ltd.", "ltd", "llc", "co.", "plc",
    // Generic institution words.
    "foundation", "institute", "university", "college", "bank", "group", "agency",
    // Continental-European legal forms.
    "gmbh", "ag", "kg", "sa", "s.a.", "s.l.", "s.r.l.", "spa", "nv", "bv", "pty", "ab",
    // Spelled-out long forms.
    "limited", "corporation", "incorporated", "company", "holding", "holdings",
];
/// Honorifics that may precede a person's name, with and without the dot.
const PERSON_PREFIX: &[&str] = &[
    "mr.", "mr", // mister
    "ms.", "ms", // ms
    "mrs.", "mrs", // missus
    "dr.", "dr", // doctor
    "prof.", "prof", // professor
];
/// Prepositions that often introduce a place name ("in Paris",
/// "aus Berlin", "à Lyon"), grouped by language.
const LOC_PREPOSITION: &[&str] = &[
    // English
    "in", "from", "at", "to", "near",
    // German
    "aus", "nach", "bei", "von",
    // French / Spanish
    "en", "de", "à", "dans", "por", "sur",
];
/// Job titles that look like capitalized name candidates but must never be
/// emitted as single-word entities on their own.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const SKIP_WORDS: &[&str] = &["ceo", "cto", "cfo", "vp", "president", "chairman", "director"];
101
/// Lowercase words that commonly begin a sentence. A capitalized span
/// starting with one of these is treated as sentence case, not a name.
const COMMON_SENTENCE_STARTERS: &[&str] = &[
    // Determiners and pronouns.
    "the", "a", "an", "this", "that", "these", "those", "it", "he", "she", "we", "they",
    // Prepositions.
    "in", "on", "at", "to", "for", "from", "by", "with",
    // Conjunctions.
    "and", "but", "or", "so", "yet", "if", "because",
    // Imperative openers common in boilerplate text.
    "contact", "call", "email", "visit", "please", "see", "note",
    // Temporal adverbs.
    "today", "yesterday", "tomorrow", "now", "then",
    // Interrogatives.
    "what", "where", "when", "who", "why", "how",
    // Auxiliary verbs.
    "is", "are", "was", "were", "be", "been", "have", "has", "had",
];
159
/// Well-known organization names: lowercase ASCII entries matched against
/// word spans, plus Japanese/Chinese names used by the CJK substring scan.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const KNOWN_ORGS: &[&str] = &[
    // Technology.
    "google", "apple", "microsoft", "amazon", "facebook", "meta", "tesla", "twitter",
    "ibm", "intel", "nvidia", "oracle", "cisco", "samsung", "sony",
    // Automotive.
    "toyota", "honda", "bmw", "mercedes", "volkswagen",
    // Government and international bodies.
    "nasa", "fbi", "cia", "nsa", "nato", "un", "eu",
    // Media.
    "bbc", "cnn", "nbc", "cbs", "abc", "fox", "nyt", "wsj", "reuters", "bloomberg",
    // Consumer and finance.
    "spotify", "netflix", "uber", "airbnb", "paypal", "visa", "mastercard", "amex",
    // Japanese (katakana / kanji).
    "ソニー", "トヨタ", "ホンダ", "任天堂", "サムスン", "ファーウェイ", "アリババ", "テンセント",
    // Chinese.
    "华为", "阿里巴巴", "腾讯", "百度", "小米",
];
224
/// Well-known place names: lowercase ASCII entries matched against word
/// spans, plus CJK names used by the substring scan.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const KNOWN_LOCS: &[&str] = &[
    // Cities.
    "paris", "london", "tokyo", "berlin", "rome", "madrid", "moscow", "beijing",
    "shanghai", "dubai", "singapore", "sydney", "toronto", "chicago", "boston",
    // US states and regions.
    "california", "texas", "florida", "new york", "washington",
    // Continents.
    "europe", "asia", "africa", "america", "australia",
    // Countries.
    "china", "india", "japan", "germany", "france", "italy", "spain", "brazil",
    "mexico", "russia", "korea", "canada", "uk", "usa",
    // CJK place names.
    "東京", "大阪", "京都", "北京", "上海", "香港", "ソウル", "台北",
    "中国", "日本", "韓国", "アメリカ", "イギリス", "フランス", "ドイツ",
];
283
/// Common Western given names (lowercase). A span whose first word is one
/// of these is classified as a person.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const KNOWN_PERSONS: &[&str] = &[
    "john", "jane", "mary", "james", "robert", "michael", "william",
    "david", "richard", "joseph", "thomas", "charles", "barack", "donald",
    "joe", "george", "bill", "vladimir", "emmanuel", "boris", "narendra",
    "justin", "elon", "jeff", "mark", "steve", "tim", "satya", "sundar",
    "albert", "isaac", "stephen", "neil", "peter", "paul", "matthew",
    "andrew", "philip", "simon", "marie", "angela", "hillary", "nancy",
    "kamala", "michelle", "melania", "jill", "theresa", "ursula",
];
293
impl Model for HeuristicNER {
    /// Extract Person / Organization / Location entities from `text`.
    ///
    /// Strategy: (1) if the text contains CJK characters, exact-match the
    /// non-Latin gazetteer entries (capitalization cues don't exist there);
    /// (2) scan whitespace-separated words for capitalized spans, grow each
    /// span across consecutive capitalized words (plus "of"/"the"
    /// connectors), and classify it via `classify_minimal`. Candidates
    /// scoring below `self.threshold` or classified `Other` are dropped.
    ///
    /// Start/end offsets on the returned entities are char indices into
    /// `text`, not byte indices. The `_language` hint is currently ignored.
    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
        if text.is_empty() {
            return Ok(vec![]);
        }

        let mut entities: Vec<Entity> = Vec::new();

        // True when the text contains CJK ideographs, hiragana, or katakana.
        let has_cjk = text.chars().any(|c| {
            ('\u{4e00}'..='\u{9fff}').contains(&c) // CJK unified ideographs
                || ('\u{3040}'..='\u{309f}').contains(&c) // hiragana
                || ('\u{30a0}'..='\u{30ff}').contains(&c) // katakana
        });

        if has_cjk {
            // byte_to_char presumably converts a byte offset in `text` into
            // the corresponding char offset — TODO confirm against
            // `crate::offset::SpanConverter` docs.
            let converter = crate::offset::SpanConverter::new(text);

            // Exact substring match for the non-Latin organization entries.
            for &org in KNOWN_ORGS {
                // Only entries containing chars at/above U+3040 (kana or
                // ideographs); ASCII names are handled by the word scan.
                if org.chars().any(|c| c >= '\u{3040}') {
                    // Span length in chars; for ASCII, len() equals
                    // chars().count(), so the fast path is equivalent.
                    let org_char_count = if org.is_ascii() {
                        org.len()
                    } else {
                        org.chars().count()
                    };

                    for (start_byte, _) in text.match_indices(org) {
                        let char_start = converter.byte_to_char(start_byte);
                        let char_end = char_start + org_char_count;
                        // De-duplicate identical spans.
                        if !entities
                            .iter()
                            .any(|e| e.start == char_start && e.end == char_end)
                        {
                            entities.push(Entity::new(
                                org.to_string(),
                                EntityType::Organization,
                                char_start,
                                char_end,
                                0.9,
                            ));
                        }
                    }
                }
            }
            // Same exact-match pass for known non-Latin place names.
            for &loc in KNOWN_LOCS {
                if loc.chars().any(|c| c >= '\u{3040}') {
                    let loc_char_count = if loc.is_ascii() {
                        loc.len()
                    } else {
                        loc.chars().count()
                    };

                    for (start_byte, _) in text.match_indices(loc) {
                        let char_start = converter.byte_to_char(start_byte);
                        let char_end = char_start + loc_char_count;
                        if !entities
                            .iter()
                            .any(|e| e.start == char_start && e.end == char_end)
                        {
                            entities.push(Entity::new(
                                loc.to_string(),
                                EntityType::Location,
                                char_start,
                                char_end,
                                0.9,
                            ));
                        }
                    }
                }
            }
        }

        // Tokenize into whitespace-separated words. Each tuple is
        // (word, start char offset, char offset one past the word's end).
        let mut words_with_pos: Vec<(&str, usize, usize)> = Vec::new();

        let mut in_word = false;
        let mut word_start_byte = 0; // byte offset, used only for slicing
        let mut word_start_char = 0; // char offset, recorded in the tuple
        let mut char_pos = 0;

        for (i, c) in text.char_indices() {
            if c.is_whitespace() {
                if in_word {
                    let word = &text[word_start_byte..i];
                    words_with_pos.push((word, word_start_char, char_pos));
                    in_word = false;
                }
            } else if !in_word {
                in_word = true;
                word_start_byte = i;
                word_start_char = char_pos;
            }
            char_pos += 1;
        }
        // Flush a final word that runs to the end of the text.
        if in_word {
            let word = &text[word_start_byte..];
            words_with_pos.push((word, word_start_char, char_pos));
        }

        let words: Vec<&str> = words_with_pos.iter().map(|(w, _, _)| *w).collect();

        // Scan for spans of capitalized words.
        let mut i = 0;
        while i < words.len() {
            let word = words[i];

            // Strip leading punctuation (quotes, parens, ...) before the
            // capitalization test.
            let clean_leading = word.trim_start_matches(|c: char| !c.is_alphanumeric());
            if clean_leading.is_empty() {
                i += 1;
                continue;
            }

            // Only capitalized words can start a span.
            if !clean_leading
                .chars()
                .next()
                .map(|c| c.is_uppercase())
                .unwrap_or(false)
            {
                i += 1;
                continue;
            }

            let start_idx = i;

            // A capitalized common word ("The", "Today", ...) is assumed to
            // be sentence case rather than the start of a name.
            let first_word_lower = word.to_lowercase();
            let first_word_clean = first_word_lower.trim_matches(|c: char| !c.is_alphanumeric());
            if COMMON_SENTENCE_STARTERS.contains(&first_word_clean) {
                i += 1;
                continue;
            }

            // Grow the span: keep consuming capitalized words, and allow a
            // lowercase "of"/"the" connector when the word after it is
            // capitalized again ("Bank of America").
            while i < words.len() {
                let w = words[i];
                let w_clean = w.trim_start_matches(|c: char| !c.is_alphanumeric());

                let ends_with_closing = w.ends_with([')', ']', '}']);
                let ends_with_punct = w.ends_with(['.', '!', '?']);

                let first_char_upper = w_clean
                    .chars()
                    .next()
                    .map(|c| c.is_uppercase())
                    .unwrap_or(false);

                let is_connector = matches!(w.to_lowercase().as_str(), "of" | "the");

                // Peek at the next word: a span may continue past trailing
                // punctuation only into a legal-form suffix ("Acme Inc."
                // style abbreviations end with a dot).
                let next_word_ok = if i + 1 < words.len() {
                    let next = words[i + 1];
                    let next_clean = next.trim_start_matches(|c: char| !c.is_alphanumeric());
                    let next_upper = next_clean
                        .chars()
                        .next()
                        .map(|c| c.is_uppercase())
                        .unwrap_or(false);

                    let is_suffix = ORG_SUFFIX.contains(&&*next_clean.to_lowercase());

                    if (ends_with_closing || ends_with_punct) && !is_suffix {
                        false
                    } else {
                        next_upper
                    }
                } else {
                    false
                };

                if first_char_upper || (is_connector && next_word_ok) {
                    i += 1;
                    if ends_with_closing || ends_with_punct {
                        // Sentence-final punctuation normally ends the span;
                        // continue only when the next word is an org suffix.
                        let is_suffix_next = if let Some(next_w) = words.get(i) {
                            let clean = next_w.to_lowercase();
                            let clean_ref = clean.trim_matches(|c: char| !c.is_alphanumeric());
                            ORG_SUFFIX.contains(&clean_ref)
                        } else {
                            false
                        };

                        if !is_suffix_next {
                            break;
                        }
                    }
                } else {
                    break;
                }
            }
            let end_idx = i;

            // Defensive guard: the inner loop always consumes the first word
            // (it already passed the same capitalization test above), so this
            // should be unreachable. NOTE(review): if it ever did fire, `i`
            // has not advanced and this `continue` would spin forever.
            if start_idx == end_idx {
                continue;
            }

            let span_words = &words[start_idx..end_idx];
            let mut entity_text = span_words.join(" ");

            // Check for an honorific directly before the span ("Dr. Smith").
            let prev_word = if start_idx > 0 {
                Some(
                    words[start_idx - 1]
                        .to_lowercase()
                        .trim_end_matches('.')
                        .to_string(),
                )
            } else {
                None
            };
            let should_include_prefix = prev_word
                .as_ref()
                .map(|p| PERSON_PREFIX.contains(&p.as_str()))
                .unwrap_or(false);

            if should_include_prefix {
                // Fold the honorific into the entity text and start the span
                // at the honorific's char offset instead.
                let prefix_word = &words[start_idx - 1];
                entity_text = format!("{} {}", prefix_word, entity_text);
                let prefix_char_start = words_with_pos[start_idx - 1].1;
                let char_start = prefix_char_start;
                let char_end = char_start + entity_text.chars().count();

                let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
                let (entity_type, confidence, reason) =
                    classify_minimal(&clean_span_words, &words, start_idx - 1);

                // Keep only confident, concretely-typed candidates.
                if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
                    entities.push(Entity::with_provenance(
                        entity_text,
                        entity_type,
                        char_start,
                        char_end,
                        confidence,
                        Provenance {
                            source: "heuristic".into(),
                            method: ExtractionMethod::Heuristic,
                            pattern: Some(reason.into()),
                            raw_confidence: Some(confidence),
                            model_version: None,
                            timestamp: None,
                        },
                    ));
                }
                continue;
            }

            // Trim punctuation from both ends of the joined span text.
            // NOTE(review): `leading_punct_len` is a BYTE count that is later
            // added to a CHAR offset; this is only correct while the stripped
            // punctuation is ASCII (curly quotes etc. would skew offsets) —
            // confirm whether non-ASCII leading punctuation can reach here.
            let leading_punct_len = entity_text.len()
                - entity_text
                    .trim_start_matches(|c: char| !c.is_alphanumeric())
                    .len();
            if leading_punct_len > 0 {
                entity_text = entity_text[leading_punct_len..].to_string();
            }

            // Pop trailing punctuation one char at a time.
            while entity_text.ends_with(|c: char| !c.is_alphanumeric()) {
                entity_text.pop();
            }

            if entity_text.is_empty() {
                continue;
            }

            let char_start = words_with_pos[start_idx].1 + leading_punct_len;
            // For ASCII, len() equals chars().count(); both arms agree.
            let char_end = char_start
                + if entity_text.is_ascii() {
                    entity_text.len()
                } else {
                    entity_text.chars().count()
                };

            let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
            let (entity_type, confidence, reason) =
                classify_minimal(&clean_span_words, &words, start_idx);

            // Keep only confident, concretely-typed candidates.
            if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
                entities.push(Entity::with_provenance(
                    entity_text,
                    entity_type,
                    char_start,
                    char_end,
                    confidence,
                    Provenance {
                        source: "heuristic".into(),
                        method: ExtractionMethod::Heuristic,
                        pattern: Some(reason.into()),
                        raw_confidence: Some(confidence),
                        model_version: None,
                        timestamp: None,
                    },
                ));
            }
        }

        Ok(entities)
    }

    /// Entity types this model can emit.
    fn supported_types(&self) -> Vec<EntityType> {
        vec![
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
        ]
    }

    /// Always available: pure-Rust heuristics, no model files or services.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "heuristic"
    }

    fn description(&self) -> &'static str {
        "Heuristic NER optimized for low complexity"
    }

    /// Advertises batch and streaming support (the model is stateless).
    fn capabilities(&self) -> crate::ModelCapabilities {
        crate::ModelCapabilities {
            batch_capable: true,
            streaming_capable: true,
            ..Default::default()
        }
    }
}
656
/// True when `w` — after stripping surrounding punctuation — contains at
/// least two alphabetic characters and every alphabetic character is
/// uppercase: "NASA" and "(FBI)" qualify, "Nasa", "A", and "123" do not.
fn is_acronym_word(w: &str) -> bool {
    let core = w.trim_matches(|c: char| !c.is_alphanumeric());
    let mut letters = 0usize;
    for c in core.chars() {
        if c.is_alphabetic() {
            // Any lowercase (or caseless) letter disqualifies the word.
            if !c.is_uppercase() {
                return false;
            }
            letters += 1;
        }
    }
    letters >= 2
}
670
671fn classify_minimal(
672 span: &[&str],
673 all_words: &[&str],
674 start_idx: usize,
675) -> (EntityType, f64, &'static str) {
676 let last_word = span.last().map(|s| s.to_lowercase()).unwrap_or_default();
677 let first_word = span.first().map(|s| s.to_lowercase()).unwrap_or_default();
678 let span_lower = span
679 .iter()
680 .map(|s| s.to_lowercase())
681 .collect::<Vec<_>>()
682 .join(" ");
683
684 let prev_word = if start_idx > 0 {
686 Some(all_words[start_idx - 1].to_lowercase())
687 } else {
688 None
689 };
690
691 let skip_pronouns = [
693 "the", "a", "an", "he", "she", "it", "they", "we", "i", "you",
694 ];
695 if span.len() == 1 && skip_pronouns.contains(&first_word.as_str()) {
696 return (EntityType::Other("skip".into()), 0.0, "skip_pronoun");
697 }
698 let first_clean_lc = first_word
700 .trim_end_matches(|c: char| !c.is_alphanumeric())
701 .to_lowercase();
702 if span.len() == 1 && SKIP_WORDS.contains(&first_clean_lc.as_str()) {
703 return (EntityType::Other("skip".into()), 0.0, "skip_word");
704 }
705
706 let last_clean: &str = last_word.trim_end_matches(|c: char| !c.is_alphanumeric());
708 if ORG_SUFFIX.contains(&last_clean) {
709 return (EntityType::Organization, 0.85, "org_suffix");
710 }
711
712 let first_clean_text = first_word.trim_end_matches(|c: char| !c.is_alphanumeric());
714 if KNOWN_ORGS.contains(&first_clean_text) || KNOWN_ORGS.contains(&span_lower.as_str()) {
715 return (EntityType::Organization, 0.80, "known_org");
716 }
717
718 if KNOWN_LOCS.contains(&first_clean_text) || KNOWN_LOCS.contains(&span_lower.as_str()) {
720 return (EntityType::Location, 0.80, "known_location");
721 }
722
723 if KNOWN_PERSONS.contains(&first_clean_text) {
725 return (EntityType::Person, 0.75, "common_name");
726 }
727
728 if let Some(prev) = &prev_word {
730 let prev_clean: &str = prev.trim_end_matches('.');
731 if PERSON_PREFIX.contains(&prev_clean) {
732 return (EntityType::Person, 0.80, "person_prefix_context");
733 }
734 }
735
736 let first_clean: &str = first_word.trim_end_matches('.');
738 if PERSON_PREFIX.contains(&first_clean) && span.len() >= 2 {
739 return (EntityType::Person, 0.75, "person_prefix_span");
740 }
741
742 if span.len() >= 2 {
745 let has_real_acronym = span.iter().any(|w| {
746 is_acronym_word(w) && {
747 let lc = w.to_lowercase();
748 !SKIP_WORDS.contains(&lc.trim_matches(|c: char| !c.is_alphanumeric()))
749 }
750 });
751 if has_real_acronym {
752 return (EntityType::Organization, 0.70, "acronym_in_span");
753 }
754 }
755
756 if let Some(prev) = &prev_word {
758 if LOC_PREPOSITION.contains(&prev.as_str()) {
759 return (EntityType::Location, 0.70, "loc_context");
760 }
761 }
762
763 if span.len() == 2 {
766 let place_indicators = ["united", "new", "south", "north", "west", "east", "great"];
767 if place_indicators.contains(&first_word.as_str()) {
768 return (EntityType::Location, 0.65, "loc_indicator");
769 }
770 return (EntityType::Person, 0.60, "two_word_name");
771 }
772
773 if span.len() >= 3 {
775 if span.len() >= 2 && span[1].to_lowercase() == "of" {
777 return (EntityType::Organization, 0.65, "org_of_pattern");
778 }
779 return (EntityType::Organization, 0.50, "long_span_org");
780 }
781
782 if span.len() == 1 {
784 let word = span[0].trim_matches(|c: char| !c.is_alphanumeric());
785 if word.len() == 1 {
786 return (EntityType::Other("skip".into()), 0.0, "single_letter");
787 }
788 if is_acronym_word(word) {
789 let lc = word.to_lowercase();
790 if !SKIP_WORDS.contains(&lc.as_str()) {
791 return (EntityType::Organization, 0.55, "single_acronym");
792 }
793 }
794 }
795
796 if start_idx == 0 && prev_word.is_none() {
798 return (EntityType::Person, 0.30, "single_start_word");
799 }
800
801 (EntityType::Person, 0.45, "capitalized")
803}
804
// Marker impl: advertises that this model supports named-entity extraction.
impl crate::NamedEntityCapable for HeuristicNER {}
806
807impl crate::BatchCapable for HeuristicNER {
812 fn optimal_batch_size(&self) -> Option<usize> {
813 Some(16) }
815}
816
817impl crate::StreamingCapable for HeuristicNER {
822 fn recommended_chunk_size(&self) -> usize {
823 8192 }
825}
826
827#[cfg(test)]
828mod tests;