Skip to main content

anno/backends/
heuristic.rs

1//! Heuristic NER - optimized for low Kolmogorov complexity
2//!
3//! A heuristic-based NER model that achieves reasonable performance with minimal
4//! complexity. The goal is to minimize description length while maximizing
5//! downstream quality.
6//!
7//! Core principles:
8//! 1. Exploit structural signals (capitalization, punctuation) - "free" features
9//! 2. Use high-precision patterns (Inc., Dr., in/from) - small fixed cost
10//! 3. Avoid large lexicons - high description cost per marginal gain
11
12use crate::{Entity, EntityType, ExtractionMethod, Model, Provenance, Result};
13
/// Heuristic NER model.
///
/// A heuristic-based NER model optimized for low Kolmogorov complexity.
/// Uses high-precision patterns with minimal lexical resources.
///
/// Candidate entities are scored by rule-based heuristics and kept only
/// when their confidence is at or above `threshold` (0.35 by default).
#[derive(Debug, Clone)]
pub struct HeuristicNER {
    /// Minimum confidence threshold for entity extraction.
    /// Candidates scoring below this value are discarded.
    threshold: f64,
}
23
24impl Default for HeuristicNER {
25    fn default() -> Self {
26        Self { threshold: 0.35 }
27    }
28}
29
30impl HeuristicNER {
31    /// Create a new HeuristicNER instance with default threshold.
32    #[must_use]
33    pub fn new() -> Self {
34        Self::default()
35    }
36
37    /// Create a new HeuristicNER with a custom confidence threshold.
38    #[must_use]
39    pub fn with_threshold(threshold: f64) -> Self {
40        Self { threshold }
41    }
42}
43
// High-precision patterns (small, fixed cost)
/// Lowercase suffix tokens that mark an organization name; compared
/// against the (lowercased, punctuation-trimmed) last word of a span.
const ORG_SUFFIX: &[&str] = &[
    // US/UK legal forms
    "inc.", "inc", "corp.", "corp", "ltd.", "ltd", "llc", "co.", "plc",
    // Institutional nouns
    "foundation", "institute", "university", "college", "bank", "group", "agency",
    // International suffixes
    "gmbh", "ag", "kg", "sa", "s.a.", "s.l.", "s.r.l.", "spa", "nv", "bv", "pty", "ab",
    // Spelled-out forms
    "limited", "corporation", "incorporated", "company", "holding", "holdings",
];
/// Lowercase honorifics that precede a person's name. Stored both with and
/// without the trailing period; callers additionally strip '.' before lookup.
const PERSON_PREFIX: &[&str] = &[
    "mr.", "mr", "ms.", "ms", "mrs.", "mrs", "dr.", "dr", "prof.", "prof",
];
/// Prepositions whose following capitalized word is likely a location
/// (e.g. "in Paris", "aus Berlin"). Matched against the previous word.
const LOC_PREPOSITION: &[&str] = &[
    // English
    "in", "from", "at", "to", "near",
    // German
    "aus", "nach", "bei", "von",
    // French/Spanish/Italian
    "en", "de", "à", "dans", "por", "sur",
];
// Common words that look like entities but aren't (all caps acronyms, titles)
/// Role titles excluded from the acronym and single-word rules
/// (e.g. "CEO" should never be classified as an organization).
#[allow(dead_code)] // Used in classify_minimal
const SKIP_WORDS: &[&str] = &[
    "ceo", "cto", "cfo", "vp", "president", "chairman", "director",
];
101
// Words that commonly start sentences but are not entities
// (lowercase; compared against the cleaned, lowercased first span word).
const COMMON_SENTENCE_STARTERS: &[&str] = &[
    // Determiners / demonstratives
    "the", "a", "an", "this", "that", "these", "those",
    // Pronouns
    "it", "he", "she", "we", "they",
    // Prepositions
    "in", "on", "at", "to", "for", "from", "by", "with",
    // Conjunctions
    "and", "but", "or", "so", "yet", "if", "because",
    // Imperative / boilerplate openers
    "contact", "call", "email", "visit", "please", "see", "note",
    // Temporal adverbs
    "today", "yesterday", "tomorrow", "now", "then",
    // Question words
    "what", "where", "when", "who", "why", "how",
    // Auxiliary verbs
    "is", "are", "was", "were", "be", "been", "have", "has", "had",
];
159
// Minimal lexical knowledge (50 items each - high ROI)
// These are the most common entities that are hard to distinguish structurally
#[allow(dead_code)] // Used in classify_minimal but compiler sometimes misses it
const KNOWN_ORGS: &[&str] = &[
    // Tech / consumer companies
    "google", "apple", "microsoft", "amazon", "facebook", "meta", "tesla",
    "twitter", "ibm", "intel", "nvidia", "oracle", "cisco", "samsung", "sony",
    // Automakers
    "toyota", "honda", "bmw", "mercedes", "volkswagen",
    // Agencies / international bodies
    "nasa", "fbi", "cia", "nsa", "nato", "un", "eu",
    // Media
    "bbc", "cnn", "nbc", "cbs", "abc", "fox", "nyt", "wsj", "reuters", "bloomberg",
    // Internet / finance
    "spotify", "netflix", "uber", "airbnb", "paypal", "visa", "mastercard", "amex",
    // CJK Orgs
    "ソニー", "トヨタ", "ホンダ", "任天堂", "サムスン", "ファーウェイ",
    "アリババ", "テンセント", "华为", "阿里巴巴", "腾讯", "百度", "小米",
];
224
#[allow(dead_code)] // Used in classify_minimal but compiler sometimes misses it
const KNOWN_LOCS: &[&str] = &[
    // Cities
    "paris", "london", "tokyo", "berlin", "rome", "madrid", "moscow",
    "beijing", "shanghai", "dubai", "singapore", "sydney", "toronto",
    "chicago", "boston",
    // US states / metro areas
    "california", "texas", "florida", "new york", "washington",
    // Continents / regions
    "europe", "asia", "africa", "america", "australia",
    // Countries
    "china", "india", "japan", "germany", "france", "italy", "spain",
    "brazil", "mexico", "russia", "korea", "canada", "uk", "usa",
    // CJK Locs
    "東京", "大阪", "京都", "北京", "上海", "香港", "ソウル", "台北",
    "中国", "日本", "韓国", "アメリカ", "イギリス", "フランス", "ドイツ",
];
283
/// Common given names (lowercase). A span whose first token matches is
/// classified as Person with moderate confidence (rule 3.5).
#[allow(dead_code)] // Used in classify_minimal
const KNOWN_PERSONS: &[&str] = &[
    "john", "jane", "mary", "james", "robert", "michael", "william", "david", "richard", "joseph",
    "thomas", "charles", "barack", "donald", "joe", "george", "bill", "vladimir", "emmanuel",
    "boris", "narendra", "justin", "elon", "jeff", "mark", "steve", "tim", "satya", "sundar",
    "albert", "isaac", "stephen", "neil", "peter", "paul", "matthew", "andrew", "philip", "simon",
    "marie", "angela", "hillary", "nancy", "kamala", "michelle", "melania", "jill", "theresa",
    "ursula",
];
293
294impl Model for HeuristicNER {
295    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
296        if text.is_empty() {
297            return Ok(vec![]);
298        }
299
300        let mut entities: Vec<Entity> = Vec::new();
301
302        // CJK Detection & Extraction
303        // Since CJK doesn't use spaces, we scan for known entities directly
304        let has_cjk = text.chars().any(
305            |c| {
306                ('\u{4e00}'..='\u{9fff}').contains(&c) || // CJK Unified Ideographs
307            ('\u{3040}'..='\u{309f}').contains(&c) || // Hiragana
308            ('\u{30a0}'..='\u{30ff}').contains(&c)
309            }, // Katakana
310        );
311
312        if has_cjk {
313            // Performance: Build byte-to-char mapping once for this text.
314            // ROI: High - reused across all CJK substring matches.
315            let converter = crate::offset::SpanConverter::new(text);
316
317            for &org in KNOWN_ORGS {
318                // Simple substring search for CJK terms
319                if org.chars().any(|c| c >= '\u{3040}') {
320                    let org_char_count = if org.is_ascii() {
321                        org.len()
322                    } else {
323                        org.chars().count()
324                    };
325
326                    // Include Hiragana/Katakana
327                    // Standard library substring search - efficient for typical NER workloads
328                    for (start_byte, _) in text.match_indices(org) {
329                        let char_start = converter.byte_to_char(start_byte);
330                        let char_end = char_start + org_char_count;
331                        // Avoid duplicates if already found (simple overlap check)
332                        if !entities
333                            .iter()
334                            .any(|e| e.start == char_start && e.end == char_end)
335                        {
336                            entities.push(Entity::new(
337                                org.to_string(),
338                                EntityType::Organization,
339                                char_start,
340                                char_end,
341                                0.9,
342                            ));
343                        }
344                    }
345                }
346            }
347            for &loc in KNOWN_LOCS {
348                if loc.chars().any(|c| c >= '\u{3040}') {
349                    let loc_char_count = if loc.is_ascii() {
350                        loc.len()
351                    } else {
352                        loc.chars().count()
353                    };
354
355                    // Standard library substring search - efficient for typical NER workloads
356                    for (start_byte, _) in text.match_indices(loc) {
357                        let char_start = converter.byte_to_char(start_byte);
358                        let char_end = char_start + loc_char_count;
359                        if !entities
360                            .iter()
361                            .any(|e| e.start == char_start && e.end == char_end)
362                        {
363                            entities.push(Entity::new(
364                                loc.to_string(),
365                                EntityType::Location,
366                                char_start,
367                                char_end,
368                                0.9,
369                            ));
370                        }
371                    }
372                }
373            }
374        }
375
376        // Build word list with character positions
377        // Robust strategy: Scan text linearly, identifying word boundaries.
378        // This avoids synchronization issues between split_whitespace and find.
379        let mut words_with_pos: Vec<(&str, usize, usize)> = Vec::new();
380
381        let mut in_word = false;
382        let mut word_start_byte = 0;
383        let mut word_start_char = 0;
384        let mut char_pos = 0;
385
386        for (i, c) in text.char_indices() {
387            if c.is_whitespace() {
388                if in_word {
389                    // Word ended
390                    let word = &text[word_start_byte..i];
391                    words_with_pos.push((word, word_start_char, char_pos));
392                    in_word = false;
393                }
394            } else if !in_word {
395                in_word = true;
396                word_start_byte = i;
397                word_start_char = char_pos;
398            }
399            char_pos += 1;
400        }
401        // Last word
402        if in_word {
403            let word = &text[word_start_byte..];
404            words_with_pos.push((word, word_start_char, char_pos));
405        }
406
407        let words: Vec<&str> = words_with_pos.iter().map(|(w, _, _)| *w).collect();
408
409        let mut i = 0;
410        while i < words.len() {
411            let word = words[i];
412
413            // Pre-check: Clean leading punctuation before checking capitalization
414            let clean_leading = word.trim_start_matches(|c: char| !c.is_alphanumeric());
415            if clean_leading.is_empty() {
416                i += 1;
417                continue;
418            }
419
420            // Only consider capitalized words as candidates
421            if !clean_leading
422                .chars()
423                .next()
424                .map(|c| c.is_uppercase())
425                .unwrap_or(false)
426            {
427                i += 1;
428                continue;
429            }
430
431            // Find span of consecutive capitalized words
432            // Only allow "of" and "the" as connectors (not "and" which separates entities)
433            let start_idx = i;
434
435            // Filter: Skip common sentence starters if this is the first word of a span
436            let first_word_lower = word.to_lowercase();
437            let first_word_clean = first_word_lower.trim_matches(|c: char| !c.is_alphanumeric());
438            if COMMON_SENTENCE_STARTERS.contains(&first_word_clean) {
439                i += 1;
440                continue;
441            }
442
443            while i < words.len() {
444                let w = words[i];
445                let w_clean = w.trim_start_matches(|c: char| !c.is_alphanumeric());
446
447                // Check if word ends with closing parenthesis - implies end of group
448                // Also check for sentence boundaries (. ! ?) unless it's a known suffix like Inc. or Mr.
449                // NOTE: We assume '.' inside a word (e.g. U.S.A.) is fine, but '.' at end is boundary.
450                // Unless next word is lower case (abbreviation).
451                let ends_with_closing = w.ends_with([')', ']', '}']);
452                let ends_with_punct = w.ends_with(['.', '!', '?']);
453
454                let first_char_upper = w_clean
455                    .chars()
456                    .next()
457                    .map(|c| c.is_uppercase())
458                    .unwrap_or(false);
459
460                // Only "of" and "the" connect entity names (e.g., "Bank of America", "The New York Times")
461                // "and" separates entities (e.g., "Paris and London" are two entities)
462                let is_connector = matches!(w.to_lowercase().as_str(), "of" | "the");
463
464                // Check next word
465                let next_word_ok = if i + 1 < words.len() {
466                    let next = words[i + 1];
467                    let next_clean = next.trim_start_matches(|c: char| !c.is_alphanumeric());
468                    let next_upper = next_clean
469                        .chars()
470                        .next()
471                        .map(|c| c.is_uppercase())
472                        .unwrap_or(false);
473
474                    // Special case: "Inc", "Corp" etc can follow a closing parenthesis
475                    // e.g. "Google) Inc" -> merged
476                    let is_suffix = ORG_SUFFIX.contains(&&*next_clean.to_lowercase());
477
478                    if (ends_with_closing || ends_with_punct) && !is_suffix {
479                        false // Break span at closing parenthesis/punctuation unless followed by suffix
480                    } else {
481                        next_upper
482                    }
483                } else {
484                    false
485                };
486
487                if first_char_upper || (is_connector && next_word_ok) {
488                    i += 1;
489                    // If this word ended with closing parenthesis, and we didn't break above (because next is suffix),
490                    // continue. If we broke above, loop terminates.
491                    if ends_with_closing || ends_with_punct {
492                        let is_suffix_next = if let Some(next_w) = words.get(i) {
493                            let clean = next_w.to_lowercase();
494                            let clean_ref = clean.trim_matches(|c: char| !c.is_alphanumeric());
495                            ORG_SUFFIX.contains(&clean_ref)
496                        } else {
497                            false
498                        };
499
500                        if !is_suffix_next {
501                            break;
502                        }
503                    }
504                } else {
505                    break;
506                }
507            }
508            let end_idx = i;
509
510            if start_idx == end_idx {
511                continue;
512            }
513
514            // Extract the span
515            let span_words = &words[start_idx..end_idx];
516            let mut entity_text = span_words.join(" ");
517
518            // Check if previous word is a person prefix (e.g., "Dr.", "Mr.")
519            let prev_word = if start_idx > 0 {
520                Some(
521                    words[start_idx - 1]
522                        .to_lowercase()
523                        .trim_end_matches('.')
524                        .to_string(),
525                )
526            } else {
527                None
528            };
529            let should_include_prefix = prev_word
530                .as_ref()
531                .map(|p| PERSON_PREFIX.contains(&p.as_str()))
532                .unwrap_or(false);
533
534            // If previous word is a person prefix, include it in the entity text
535            if should_include_prefix {
536                let prefix_word = &words[start_idx - 1];
537                entity_text = format!("{} {}", prefix_word, entity_text);
538                // Adjust start position to include prefix
539                let prefix_char_start = words_with_pos[start_idx - 1].1;
540                let char_start = prefix_char_start;
541                let char_end = char_start + entity_text.chars().count();
542
543                // Classify based on minimal rules
544                let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
545                let (entity_type, confidence, reason) =
546                    classify_minimal(&clean_span_words, &words, start_idx - 1);
547
548                // Skip low-confidence and filtered entities
549                if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
550                    entities.push(Entity::with_provenance(
551                        entity_text,
552                        entity_type,
553                        char_start,
554                        char_end,
555                        confidence,
556                        Provenance {
557                            source: "heuristic".into(),
558                            method: ExtractionMethod::Heuristic,
559                            pattern: Some(reason.into()),
560                            raw_confidence: Some(confidence),
561                            model_version: None,
562                            timestamp: None,
563                        },
564                    ));
565                }
566                continue; // Skip the normal processing below
567            }
568
569            // Clean leading punctuation from first word (but not person prefixes)
570            let leading_punct_len = entity_text.len()
571                - entity_text
572                    .trim_start_matches(|c: char| !c.is_alphanumeric())
573                    .len();
574            if leading_punct_len > 0 {
575                entity_text = entity_text[leading_punct_len..].to_string();
576            }
577
578            // Clean trailing punctuation from the last word
579            while entity_text.ends_with(|c: char| !c.is_alphanumeric()) {
580                entity_text.pop();
581            }
582
583            // Skip if entity became empty after cleaning
584            if entity_text.is_empty() {
585                continue;
586            }
587
588            // Get character offsets from our position tracking
589            // Correct start offset by adding leading punctuation length
590            let char_start = words_with_pos[start_idx].1 + leading_punct_len;
591            // Performance: Use entity_text.len() for ASCII, fallback to chars().count() for Unicode
592            let char_end = char_start
593                + if entity_text.is_ascii() {
594                    entity_text.len()
595                } else {
596                    entity_text.chars().count()
597                };
598
599            // Classify based on minimal rules
600            // Use cleaned span for classification to avoid punctuation noise
601            let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
602            let (entity_type, confidence, reason) =
603                classify_minimal(&clean_span_words, &words, start_idx);
604
605            // Skip low-confidence and filtered entities
606            if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
607                entities.push(Entity::with_provenance(
608                    entity_text,
609                    entity_type,
610                    char_start,
611                    char_end,
612                    confidence,
613                    Provenance {
614                        source: "heuristic".into(),
615                        method: ExtractionMethod::Heuristic,
616                        pattern: Some(reason.into()),
617                        raw_confidence: Some(confidence),
618                        model_version: None,
619                        timestamp: None,
620                    },
621                ));
622            }
623        }
624
625        Ok(entities)
626    }
627
628    fn supported_types(&self) -> Vec<EntityType> {
629        vec![
630            EntityType::Person,
631            EntityType::Organization,
632            EntityType::Location,
633        ]
634    }
635
636    fn is_available(&self) -> bool {
637        true
638    }
639
640    fn name(&self) -> &'static str {
641        "heuristic"
642    }
643
644    fn description(&self) -> &'static str {
645        "Heuristic NER optimized for low complexity"
646    }
647}
648
/// Domain-agnostic, language-agnostic acronym check.  True when every
/// alphabetic character is uppercase and there are at least 2 alphabetic
/// characters.  Unicode-aware: works for Latin (NASA), Cyrillic (НАТО),
/// and gracefully returns false for caseless scripts (CJK, Arabic).
fn is_acronym_word(w: &str) -> bool {
    // Strip surrounding punctuation (quotes, parens, trailing periods).
    let core = w.trim_matches(|c: char| !c.is_alphanumeric());

    // Single pass: bail out on the first non-uppercase letter while
    // counting the alphabetic characters seen so far.
    let mut letters = 0;
    for ch in core.chars() {
        if ch.is_alphabetic() {
            if !ch.is_uppercase() {
                return false;
            }
            letters += 1;
        }
    }
    letters >= 2
}
662
/// Classify a candidate span of capitalized words into an entity type.
///
/// * `span` - the words making up the candidate entity (cleaned by the caller
///   in the normal path; raw, prefix-included words in the person-prefix path).
/// * `all_words` - every whitespace token of the input text.
/// * `start_idx` - index of the span's first word within `all_words`, used to
///   inspect the preceding word for context.
///
/// Returns `(type, confidence, rule_name)`. Rules run in roughly descending
/// precision order; `EntityType::Other("skip")` with confidence 0.0 marks
/// spans the caller should drop.
fn classify_minimal(
    span: &[&str],
    all_words: &[&str],
    start_idx: usize,
) -> (EntityType, f64, &'static str) {
    let last_word = span.last().map(|s| s.to_lowercase()).unwrap_or_default();
    let first_word = span.first().map(|s| s.to_lowercase()).unwrap_or_default();
    let span_lower = span
        .iter()
        .map(|s| s.to_lowercase())
        .collect::<Vec<_>>()
        .join(" ");

    // Get context: the lowercased word immediately before the span, if any.
    let prev_word = if start_idx > 0 {
        Some(all_words[start_idx - 1].to_lowercase())
    } else {
        None
    };

    // Filter: Skip common pronouns/articles/titles that get capitalized
    let skip_pronouns = [
        "the", "a", "an", "he", "she", "it", "they", "we", "i", "you",
    ];
    if span.len() == 1 && skip_pronouns.contains(&first_word.as_str()) {
        return (EntityType::Other("skip".into()), 0.0, "skip_pronoun");
    }
    // Filter: Skip job titles and common non-entity nouns
    // (first_word is already lowercase, so to_lowercase here is a no-op kept
    // for clarity of intent)
    let first_clean_lc = first_word
        .trim_end_matches(|c: char| !c.is_alphanumeric())
        .to_lowercase();
    if span.len() == 1 && SKIP_WORDS.contains(&first_clean_lc.as_str()) {
        return (EntityType::Other("skip".into()), 0.0, "skip_word");
    }

    // Rule 1: ORG suffix (highest precision)
    let last_clean: &str = last_word.trim_end_matches(|c: char| !c.is_alphanumeric());
    if ORG_SUFFIX.contains(&last_clean) {
        return (EntityType::Organization, 0.85, "org_suffix");
    }

    // Rule 2: Known organization name (matched on first word or whole span)
    let first_clean_text = first_word.trim_end_matches(|c: char| !c.is_alphanumeric());
    if KNOWN_ORGS.contains(&first_clean_text) || KNOWN_ORGS.contains(&span_lower.as_str()) {
        return (EntityType::Organization, 0.80, "known_org");
    }

    // Rule 3: Known location name
    if KNOWN_LOCS.contains(&first_clean_text) || KNOWN_LOCS.contains(&span_lower.as_str()) {
        return (EntityType::Location, 0.80, "known_location");
    }

    // Rule 3.5: Known person name (first given name only)
    if KNOWN_PERSONS.contains(&first_clean_text) {
        return (EntityType::Person, 0.75, "common_name");
    }

    // Rule 4: Person prefix in previous word (e.g. "Dr." before the span)
    if let Some(prev) = &prev_word {
        let prev_clean: &str = prev.trim_end_matches('.');
        if PERSON_PREFIX.contains(&prev_clean) {
            return (EntityType::Person, 0.80, "person_prefix_context");
        }
    }

    // Rule 5: First word is a title -> Person (e.g. span "Dr. Smith")
    let first_clean: &str = first_word.trim_end_matches('.');
    if PERSON_PREFIX.contains(&first_clean) && span.len() >= 2 {
        return (EntityType::Person, 0.75, "person_prefix_span");
    }

    // Rule 5.5: Acronym signal (domain-agnostic, language-agnostic).
    // Excludes SKIP_WORDS (CEO/CTO/VP) which are role titles, not entities.
    if span.len() >= 2 {
        let has_real_acronym = span.iter().any(|w| {
            is_acronym_word(w) && {
                let lc = w.to_lowercase();
                !SKIP_WORDS.contains(&lc.trim_matches(|c: char| !c.is_alphanumeric()))
            }
        });
        if has_real_acronym {
            return (EntityType::Organization, 0.70, "acronym_in_span");
        }
    }

    // Rule 6: Location preposition context (e.g. "in Paris")
    if let Some(prev) = &prev_word {
        if LOC_PREPOSITION.contains(&prev.as_str()) {
            return (EntityType::Location, 0.70, "loc_context");
        }
    }

    // Rule 7: Two capitalized words (likely person name)
    // Unless it looks like a country/place (contains "United", "New", etc.)
    if span.len() == 2 {
        let place_indicators = ["united", "new", "south", "north", "west", "east", "great"];
        if place_indicators.contains(&first_word.as_str()) {
            return (EntityType::Location, 0.65, "loc_indicator");
        }
        return (EntityType::Person, 0.60, "two_word_name");
    }

    // Rule 8: Three+ words -> likely ORG or LOC, not PER
    if span.len() >= 3 {
        // "Bank of X" pattern -> ORG
        // (the len() >= 2 check is redundant under len() >= 3 but harmless)
        if span.len() >= 2 && span[1].to_lowercase() == "of" {
            return (EntityType::Organization, 0.65, "org_of_pattern");
        }
        return (EntityType::Organization, 0.50, "long_span_org");
    }

    // Rule 9: Single-word structural signals
    if span.len() == 1 {
        let word = span[0].trim_matches(|c: char| !c.is_alphanumeric());
        if word.len() == 1 {
            return (EntityType::Other("skip".into()), 0.0, "single_letter");
        }
        if is_acronym_word(word) {
            let lc = word.to_lowercase();
            if !SKIP_WORDS.contains(&lc.as_str()) {
                return (EntityType::Organization, 0.55, "single_acronym");
            }
        }
    }

    // Rule 10: Single word at sentence start with no context - very low confidence
    if start_idx == 0 && prev_word.is_none() {
        return (EntityType::Person, 0.30, "single_start_word");
    }

    // Default: single capitalized word mid-sentence - assume Person
    (EntityType::Person, 0.45, "capitalized")
}
796
/// Marker impl: advertises that this backend supports named-entity extraction.
impl crate::NamedEntityCapable for HeuristicNER {}
798
// =============================================================================
// BatchCapable Trait Implementation
// =============================================================================

impl crate::BatchCapable for HeuristicNER {
    /// Suggested number of texts per batch.
    fn optimal_batch_size(&self) -> Option<usize> {
        Some(16) // HeuristicNER is fast, can handle larger batches
    }
}
808
// =============================================================================
// StreamingCapable Trait Implementation
// =============================================================================

impl crate::StreamingCapable for HeuristicNER {
    /// Recommended chunk size (in characters) for streaming large inputs.
    fn recommended_chunk_size(&self) -> usize {
        8192 // Characters - heuristics are lightweight
    }
}
818
819#[cfg(test)]
820mod tests {
821    use super::*;
822
    #[test]
    fn test_basic_person_detection() {
        // "Dr." honorific plus a capitalized span should surface John Smith.
        let ner = HeuristicNER::new();
        let entities = ner
            .extract_entities("Dr. John Smith met with Mary.", None)
            .unwrap();

        let names: Vec<&str> = entities.iter().map(|e| e.text.as_str()).collect();
        assert!(
            names
                .iter()
                .any(|n| n.contains("John") || n.contains("Smith")),
            "Should detect John Smith: {:?}",
            names
        );
    }
839
    #[test]
    fn test_organization_suffix_detection() {
        // The "Inc." suffix should trigger the org_suffix rule.
        let ner = HeuristicNER::new();
        let entities = ner
            .extract_entities("Apple Inc. announced new products.", None)
            .unwrap();

        let orgs: Vec<_> = entities
            .iter()
            .filter(|e| matches!(e.entity_type, EntityType::Organization))
            .collect();
        assert!(!orgs.is_empty(), "Should detect Apple Inc. as organization");
    }
853
    #[test]
    fn test_location_preposition_context() {
        // "in Paris" matches both the known-location lexicon and the
        // preposition-context rule.
        let ner = HeuristicNER::new();
        let entities = ner
            .extract_entities("She lived in Paris for years.", None)
            .unwrap();

        let locs: Vec<_> = entities
            .iter()
            .filter(|e| matches!(e.entity_type, EntityType::Location))
            .collect();
        assert!(!locs.is_empty(), "Should detect Paris as location");
    }
867
    #[test]
    fn test_known_organizations() {
        // "and" must split the two org names into separate entities,
        // each resolved via the KNOWN_ORGS lexicon.
        let ner = HeuristicNER::new();
        let entities = ner
            .extract_entities("Google and Microsoft competed.", None)
            .unwrap();

        let texts: Vec<&str> = entities.iter().map(|e| e.text.as_str()).collect();
        assert!(
            texts.iter().any(|t| t.contains("Google")),
            "Should detect Google"
        );
        assert!(
            texts.iter().any(|t| t.contains("Microsoft")),
            "Should detect Microsoft"
        );
    }
885
    #[test]
    fn test_cjk_organization_detection() {
        // CJK path: substring match against the CJK entries of KNOWN_ORGS.
        let ner = HeuristicNER::new();
        let entities = ner
            .extract_entities("ソニーが新製品を発表しました。", None)
            .unwrap();

        let orgs: Vec<_> = entities
            .iter()
            .filter(|e| matches!(e.entity_type, EntityType::Organization))
            .collect();
        assert!(
            !orgs.is_empty(),
            "Should detect Sony (ソニー) as organization"
        );
    }
902
    #[test]
    fn test_cjk_location_detection() {
        // CJK path: substring match against the CJK entries of KNOWN_LOCS.
        let ner = HeuristicNER::new();
        let entities = ner
            .extract_entities("東京オリンピックが開催された。", None)
            .unwrap();

        let locs: Vec<_> = entities
            .iter()
            .filter(|e| matches!(e.entity_type, EntityType::Location))
            .collect();
        assert!(!locs.is_empty(), "Should detect Tokyo (東京) as location");
    }
916
    #[test]
    fn test_empty_text() {
        // Empty input is the documented fast path: Ok with no entities.
        let ner = HeuristicNER::new();
        let entities = ner.extract_entities("", None).unwrap();
        assert!(entities.is_empty());
    }
923
924    #[test]
925    fn test_no_entities() {
926        let ner = HeuristicNER::new();
927        let entities = ner
928            .extract_entities("the quick brown fox jumps over the lazy dog", None)
929            .unwrap();
930        // All lowercase, no entities expected
931        assert!(
932            entities.is_empty(),
933            "Lowercase text should have no entities"
934        );
935    }
936
937    #[test]
938    fn test_threshold_filtering() {
939        let low_threshold = HeuristicNER::with_threshold(0.1);
940        let high_threshold = HeuristicNER::with_threshold(0.9);
941
942        let text = "John works at Google.";
943        let low_entities = low_threshold.extract_entities(text, None).unwrap();
944        let high_entities = high_threshold.extract_entities(text, None).unwrap();
945
946        // Lower threshold should capture more or equal entities
947        assert!(low_entities.len() >= high_entities.len());
948    }
949
950    #[test]
951    fn test_sentence_starter_filtering() {
952        let ner = HeuristicNER::new();
953        let entities = ner
954            .extract_entities("The dog ran. It was fast.", None)
955            .unwrap();
956
957        // "The" and "It" should be filtered as common sentence starters
958        let texts: Vec<&str> = entities.iter().map(|e| e.text.as_str()).collect();
959        assert!(
960            !texts.contains(&"The"),
961            "Should filter 'The' as sentence starter"
962        );
963        assert!(!texts.contains(&"It"), "Should filter 'It' as pronoun");
964    }
965
966    #[test]
967    fn test_person_prefix_detection() {
968        let ner = HeuristicNER::new();
969        let entities = ner
970            .extract_entities("Prof. Einstein presented the theory.", None)
971            .unwrap();
972
973        let persons: Vec<_> = entities
974            .iter()
975            .filter(|e| matches!(e.entity_type, EntityType::Person))
976            .collect();
977        assert!(
978            !persons.is_empty(),
979            "Should detect Prof. Einstein as person"
980        );
981    }
982
983    #[test]
984    fn test_multi_word_organization() {
985        let ner = HeuristicNER::new();
986        let entities = ner
987            .extract_entities("Bank of America provides services.", None)
988            .unwrap();
989
990        let orgs: Vec<_> = entities
991            .iter()
992            .filter(|e| matches!(e.entity_type, EntityType::Organization))
993            .collect();
994        assert!(!orgs.is_empty(), "Should detect 'Bank of America' pattern");
995    }
996
997    #[test]
998    fn test_location_indicators() {
999        let ner = HeuristicNER::new();
1000        let entities = ner
1001            .extract_entities("New Zealand is beautiful.", None)
1002            .unwrap();
1003
1004        let locs: Vec<_> = entities
1005            .iter()
1006            .filter(|e| matches!(e.entity_type, EntityType::Location))
1007            .collect();
1008        assert!(!locs.is_empty(), "Should detect 'New Zealand' as location");
1009    }
1010
1011    #[test]
1012    fn test_model_trait_implementation() {
1013        let ner = HeuristicNER::new();
1014
1015        assert_eq!(ner.name(), "heuristic");
1016        assert!(ner.is_available());
1017        assert!(!ner.supported_types().is_empty());
1018        assert!(ner.description().contains("Heuristic"));
1019    }
1020
1021    #[test]
1022    fn test_entity_offsets_are_valid() {
1023        let ner = HeuristicNER::new();
1024        let text = "Barack Obama visited Berlin yesterday.";
1025        let entities = ner.extract_entities(text, None).unwrap();
1026
1027        let char_count = text.chars().count();
1028        for entity in &entities {
1029            assert!(entity.start <= entity.end, "start should be <= end");
1030            assert!(entity.end <= char_count, "end should be within text");
1031
1032            // Verify text matches span
1033            let extracted: String = text
1034                .chars()
1035                .skip(entity.start)
1036                .take(entity.end - entity.start)
1037                .collect();
1038            assert_eq!(
1039                extracted, entity.text,
1040                "Extracted text should match entity text"
1041            );
1042        }
1043    }
1044
1045    #[test]
1046    fn test_unicode_text_handling() {
1047        let ner = HeuristicNER::new();
1048        let text = "François Müller from München met José García.";
1049        let entities = ner.extract_entities(text, None).unwrap();
1050
1051        // Should handle diacritics correctly
1052        for entity in &entities {
1053            let extracted: String = text
1054                .chars()
1055                .skip(entity.start)
1056                .take(entity.end - entity.start)
1057                .collect();
1058            assert_eq!(extracted, entity.text, "Unicode offsets should be correct");
1059        }
1060    }
1061
1062    #[test]
1063    fn test_provenance_is_set() {
1064        let ner = HeuristicNER::new();
1065        let entities = ner
1066            .extract_entities("Google announced today.", None)
1067            .unwrap();
1068
1069        for entity in &entities {
1070            if let Some(ref prov) = entity.provenance {
1071                assert_eq!(prov.source, "heuristic");
1072                assert!(matches!(prov.method, ExtractionMethod::Heuristic));
1073            }
1074        }
1075    }
1076
1077    // =========================================================================
1078    // Acronym signal tests (domain-agnostic, language-agnostic)
1079    // =========================================================================
1080
1081    #[test]
1082    fn test_is_acronym_word_latin() {
1083        assert!(is_acronym_word("PARC"));
1084        assert!(is_acronym_word("IBM"));
1085        assert!(is_acronym_word("NASA"));
1086        assert!(is_acronym_word("N2K"));
1087        assert!(is_acronym_word("DARPA."));
1088        assert!(is_acronym_word("(NATO)"));
1089        assert!(!is_acronym_word("Xerox"));
1090        assert!(!is_acronym_word("Lynn"));
1091        assert!(!is_acronym_word("A"));
1092        assert!(!is_acronym_word("42"));
1093        assert!(!is_acronym_word(""));
1094    }
1095
1096    #[test]
1097    fn test_is_acronym_word_cyrillic() {
1098        assert!(is_acronym_word("\u{041D}\u{0410}\u{0422}\u{041E}")); // НАТО
1099        assert!(is_acronym_word("\u{041C}\u{0418}\u{0414}"));         // МИД
1100        assert!(!is_acronym_word("\u{041C}\u{043E}\u{0441}\u{043A}\u{0432}\u{0430}")); // Москва
1101    }
1102
1103    #[test]
1104    fn test_is_acronym_word_caseless_scripts() {
1105        assert!(!is_acronym_word("\u{6771}\u{4EAC}"));   // 東京 (CJK)
1106        assert!(!is_acronym_word("\u{30BD}\u{30CB}\u{30FC}")); // ソニー (Katakana)
1107        assert!(!is_acronym_word("\u{062D}\u{0645}\u{0627}\u{0633}")); // حماس (Arabic)
1108    }
1109
1110    #[test]
1111    fn test_acronym_in_multi_word_span_signals_org() {
1112        let ner = HeuristicNER::new();
1113        let entities = ner
1114            .extract_entities(
1115                "Lynn Conway worked at IBM and Xerox PARC in California.",
1116                None,
1117            )
1118            .unwrap();
1119        let xerox_parc = entities.iter().find(|e| e.text == "Xerox PARC");
1120        assert!(xerox_parc.is_some(), "Should detect 'Xerox PARC': {entities:?}");
1121        assert!(
1122            matches!(xerox_parc.unwrap().entity_type, EntityType::Organization),
1123            "Xerox PARC should be ORG, got {:?}",
1124            xerox_parc.unwrap().entity_type,
1125        );
1126    }
1127
1128    #[test]
1129    fn test_acronym_no_regression_on_normal_names() {
1130        let ner = HeuristicNER::new();
1131        let entities = ner
1132            .extract_entities("Lynn Conway designed the processor.", None)
1133            .unwrap();
1134        let lynn = entities.iter().find(|e| e.text == "Lynn Conway");
1135        assert!(lynn.is_some(), "Should detect 'Lynn Conway': {entities:?}");
1136        assert!(
1137            matches!(lynn.unwrap().entity_type, EntityType::Person),
1138            "Lynn Conway should remain PER, got {:?}",
1139            lynn.unwrap().entity_type,
1140        );
1141    }
1142
1143    #[test]
1144    fn test_single_acronym_signals_org() {
1145        let ner = HeuristicNER::new();
1146        let entities = ner
1147            .extract_entities("She joined DARPA last year.", None)
1148            .unwrap();
1149        let darpa = entities.iter().find(|e| e.text == "DARPA");
1150        assert!(darpa.is_some(), "Should detect 'DARPA': {entities:?}");
1151        assert!(
1152            matches!(darpa.unwrap().entity_type, EntityType::Organization),
1153            "DARPA should be ORG, got {:?}",
1154            darpa.unwrap().entity_type,
1155        );
1156    }
1157
1158    #[test]
1159    fn test_known_loc_acronym_still_loc() {
1160        let ner = HeuristicNER::new();
1161        let entities = ner
1162            .extract_entities("She moved to USA last year.", None)
1163            .unwrap();
1164        let usa = entities.iter().find(|e| e.text == "USA");
1165        assert!(usa.is_some(), "Should detect 'USA': {entities:?}");
1166        assert!(
1167            matches!(usa.unwrap().entity_type, EntityType::Location),
1168            "USA should be LOC (gazetteer wins), got {:?}",
1169            usa.unwrap().entity_type,
1170        );
1171    }
1172}