1use crate::{Entity, EntityType, ExtractionMethod, Model, Provenance, Result};
13
/// Rule-based named-entity recognizer with no external model dependency.
///
/// Extraction is driven by capitalization patterns, small keyword lists,
/// and contextual cues (see the `Model` impl below).
#[derive(Debug, Clone)]
pub struct HeuristicNER {
    // Minimum confidence an extracted entity must reach to be emitted.
    threshold: f64,
}
23
24impl Default for HeuristicNER {
25 fn default() -> Self {
26 Self { threshold: 0.35 }
27 }
28}
29
30impl HeuristicNER {
31 #[must_use]
33 pub fn new() -> Self {
34 Self::default()
35 }
36
37 #[must_use]
39 pub fn with_threshold(threshold: f64) -> Self {
40 Self { threshold }
41 }
42}
43
/// Lowercase corporate / legal-form suffixes. A capitalized span ending in
/// one of these is classified as an organization.
const ORG_SUFFIX: &[&str] = &[
    // Anglophone legal forms (with and without the trailing dot).
    "inc.", "inc", "corp.", "corp", "ltd.", "ltd", "llc", "co.", "plc",
    // Generic institution words.
    "foundation", "institute", "university", "college", "bank", "group", "agency",
    // Continental-European legal forms.
    "gmbh", "ag", "kg", "sa", "s.a.", "s.l.", "s.r.l.", "spa", "nv", "bv", "pty", "ab",
    // Spelled-out long forms.
    "limited", "corporation", "incorporated", "company", "holding", "holdings",
];
/// Honorifics that may precede a person's name, with and without the dot.
const PERSON_PREFIX: &[&str] = &[
    "mr.", "mr", // mister
    "ms.", "ms", // ms
    "mrs.", "mrs", // missus
    "dr.", "dr", // doctor
    "prof.", "prof", // professor
];
/// Prepositions that often introduce a place name ("in Paris",
/// "aus Berlin", "à Lyon"), grouped by language.
const LOC_PREPOSITION: &[&str] = &[
    // English
    "in", "from", "at", "to", "near",
    // German
    "aus", "nach", "bei", "von",
    // French / Spanish
    "en", "de", "à", "dans", "por", "sur",
];
/// Job titles that look like capitalized name candidates but must never be
/// emitted as single-word entities on their own.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const SKIP_WORDS: &[&str] = &["ceo", "cto", "cfo", "vp", "president", "chairman", "director"];
101
/// Lowercase words that commonly begin a sentence. A capitalized span
/// starting with one of these is treated as sentence case, not a name.
const COMMON_SENTENCE_STARTERS: &[&str] = &[
    // Determiners and pronouns.
    "the", "a", "an", "this", "that", "these", "those", "it", "he", "she", "we", "they",
    // Prepositions.
    "in", "on", "at", "to", "for", "from", "by", "with",
    // Conjunctions.
    "and", "but", "or", "so", "yet", "if", "because",
    // Imperative openers common in boilerplate text.
    "contact", "call", "email", "visit", "please", "see", "note",
    // Temporal adverbs.
    "today", "yesterday", "tomorrow", "now", "then",
    // Interrogatives.
    "what", "where", "when", "who", "why", "how",
    // Auxiliary verbs.
    "is", "are", "was", "were", "be", "been", "have", "has", "had",
];
159
/// Well-known organization names: lowercase ASCII entries matched against
/// word spans, plus Japanese/Chinese names used by the CJK substring scan.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const KNOWN_ORGS: &[&str] = &[
    // Technology.
    "google", "apple", "microsoft", "amazon", "facebook", "meta", "tesla", "twitter",
    "ibm", "intel", "nvidia", "oracle", "cisco", "samsung", "sony",
    // Automotive.
    "toyota", "honda", "bmw", "mercedes", "volkswagen",
    // Government and international bodies.
    "nasa", "fbi", "cia", "nsa", "nato", "un", "eu",
    // Media.
    "bbc", "cnn", "nbc", "cbs", "abc", "fox", "nyt", "wsj", "reuters", "bloomberg",
    // Consumer and finance.
    "spotify", "netflix", "uber", "airbnb", "paypal", "visa", "mastercard", "amex",
    // Japanese (katakana / kanji).
    "ソニー", "トヨタ", "ホンダ", "任天堂", "サムスン", "ファーウェイ", "アリババ", "テンセント",
    // Chinese.
    "华为", "阿里巴巴", "腾讯", "百度", "小米",
];
224
/// Well-known place names: lowercase ASCII entries matched against word
/// spans, plus CJK names used by the substring scan.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const KNOWN_LOCS: &[&str] = &[
    // Cities.
    "paris", "london", "tokyo", "berlin", "rome", "madrid", "moscow", "beijing",
    "shanghai", "dubai", "singapore", "sydney", "toronto", "chicago", "boston",
    // US states and regions.
    "california", "texas", "florida", "new york", "washington",
    // Continents.
    "europe", "asia", "africa", "america", "australia",
    // Countries.
    "china", "india", "japan", "germany", "france", "italy", "spain", "brazil",
    "mexico", "russia", "korea", "canada", "uk", "usa",
    // CJK place names.
    "東京", "大阪", "京都", "北京", "上海", "香港", "ソウル", "台北",
    "中国", "日本", "韓国", "アメリカ", "イギリス", "フランス", "ドイツ",
];
283
/// Common Western given names (lowercase). A span whose first word is one
/// of these is classified as a person.
#[allow(dead_code)] // attribute kept from upstream; the list is read by `classify_minimal`
const KNOWN_PERSONS: &[&str] = &[
    "john", "jane", "mary", "james", "robert", "michael", "william",
    "david", "richard", "joseph", "thomas", "charles", "barack", "donald",
    "joe", "george", "bill", "vladimir", "emmanuel", "boris", "narendra",
    "justin", "elon", "jeff", "mark", "steve", "tim", "satya", "sundar",
    "albert", "isaac", "stephen", "neil", "peter", "paul", "matthew",
    "andrew", "philip", "simon", "marie", "angela", "hillary", "nancy",
    "kamala", "michelle", "melania", "jill", "theresa", "ursula",
];
293
impl Model for HeuristicNER {
    /// Extract Person / Organization / Location entities from `text`.
    ///
    /// Strategy: (1) if the text contains CJK characters, exact-match the
    /// non-Latin gazetteer entries (capitalization cues don't exist there);
    /// (2) scan whitespace-separated words for capitalized spans, grow each
    /// span across consecutive capitalized words (plus "of"/"the"
    /// connectors), and classify it via `classify_minimal`. Candidates
    /// scoring below `self.threshold` or classified `Other` are dropped.
    ///
    /// Start/end offsets on the returned entities are char indices into
    /// `text`, not byte indices. The `_language` hint is currently ignored.
    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
        if text.is_empty() {
            return Ok(vec![]);
        }

        let mut entities: Vec<Entity> = Vec::new();

        // True when the text contains CJK ideographs, hiragana, or katakana.
        let has_cjk = text.chars().any(|c| {
            ('\u{4e00}'..='\u{9fff}').contains(&c) // CJK unified ideographs
                || ('\u{3040}'..='\u{309f}').contains(&c) // hiragana
                || ('\u{30a0}'..='\u{30ff}').contains(&c) // katakana
        });

        if has_cjk {
            // byte_to_char presumably converts a byte offset in `text` into
            // the corresponding char offset — TODO confirm against
            // `crate::offset::SpanConverter` docs.
            let converter = crate::offset::SpanConverter::new(text);

            // Exact substring match for the non-Latin organization entries.
            for &org in KNOWN_ORGS {
                // Only entries containing chars at/above U+3040 (kana or
                // ideographs); ASCII names are handled by the word scan.
                if org.chars().any(|c| c >= '\u{3040}') {
                    // Span length in chars; for ASCII, len() equals
                    // chars().count(), so the fast path is equivalent.
                    let org_char_count = if org.is_ascii() {
                        org.len()
                    } else {
                        org.chars().count()
                    };

                    for (start_byte, _) in text.match_indices(org) {
                        let char_start = converter.byte_to_char(start_byte);
                        let char_end = char_start + org_char_count;
                        // De-duplicate identical spans.
                        if !entities
                            .iter()
                            .any(|e| e.start == char_start && e.end == char_end)
                        {
                            entities.push(Entity::new(
                                org.to_string(),
                                EntityType::Organization,
                                char_start,
                                char_end,
                                0.9,
                            ));
                        }
                    }
                }
            }
            // Same exact-match pass for known non-Latin place names.
            for &loc in KNOWN_LOCS {
                if loc.chars().any(|c| c >= '\u{3040}') {
                    let loc_char_count = if loc.is_ascii() {
                        loc.len()
                    } else {
                        loc.chars().count()
                    };

                    for (start_byte, _) in text.match_indices(loc) {
                        let char_start = converter.byte_to_char(start_byte);
                        let char_end = char_start + loc_char_count;
                        if !entities
                            .iter()
                            .any(|e| e.start == char_start && e.end == char_end)
                        {
                            entities.push(Entity::new(
                                loc.to_string(),
                                EntityType::Location,
                                char_start,
                                char_end,
                                0.9,
                            ));
                        }
                    }
                }
            }
        }

        // Tokenize into whitespace-separated words. Each tuple is
        // (word, start char offset, char offset one past the word's end).
        let mut words_with_pos: Vec<(&str, usize, usize)> = Vec::new();

        let mut in_word = false;
        let mut word_start_byte = 0; // byte offset, used only for slicing
        let mut word_start_char = 0; // char offset, recorded in the tuple
        let mut char_pos = 0;

        for (i, c) in text.char_indices() {
            if c.is_whitespace() {
                if in_word {
                    let word = &text[word_start_byte..i];
                    words_with_pos.push((word, word_start_char, char_pos));
                    in_word = false;
                }
            } else if !in_word {
                in_word = true;
                word_start_byte = i;
                word_start_char = char_pos;
            }
            char_pos += 1;
        }
        // Flush a final word that runs to the end of the text.
        if in_word {
            let word = &text[word_start_byte..];
            words_with_pos.push((word, word_start_char, char_pos));
        }

        let words: Vec<&str> = words_with_pos.iter().map(|(w, _, _)| *w).collect();

        // Scan for spans of capitalized words.
        let mut i = 0;
        while i < words.len() {
            let word = words[i];

            // Strip leading punctuation (quotes, parens, ...) before the
            // capitalization test.
            let clean_leading = word.trim_start_matches(|c: char| !c.is_alphanumeric());
            if clean_leading.is_empty() {
                i += 1;
                continue;
            }

            // Only capitalized words can start a span.
            if !clean_leading
                .chars()
                .next()
                .map(|c| c.is_uppercase())
                .unwrap_or(false)
            {
                i += 1;
                continue;
            }

            let start_idx = i;

            // A capitalized common word ("The", "Today", ...) is assumed to
            // be sentence case rather than the start of a name.
            let first_word_lower = word.to_lowercase();
            let first_word_clean = first_word_lower.trim_matches(|c: char| !c.is_alphanumeric());
            if COMMON_SENTENCE_STARTERS.contains(&first_word_clean) {
                i += 1;
                continue;
            }

            // Grow the span: keep consuming capitalized words, and allow a
            // lowercase "of"/"the" connector when the word after it is
            // capitalized again ("Bank of America").
            while i < words.len() {
                let w = words[i];
                let w_clean = w.trim_start_matches(|c: char| !c.is_alphanumeric());

                let ends_with_closing = w.ends_with([')', ']', '}']);
                let ends_with_punct = w.ends_with(['.', '!', '?']);

                let first_char_upper = w_clean
                    .chars()
                    .next()
                    .map(|c| c.is_uppercase())
                    .unwrap_or(false);

                let is_connector = matches!(w.to_lowercase().as_str(), "of" | "the");

                // Peek at the next word: a span may continue past trailing
                // punctuation only into a legal-form suffix ("Acme Inc."
                // style abbreviations end with a dot).
                let next_word_ok = if i + 1 < words.len() {
                    let next = words[i + 1];
                    let next_clean = next.trim_start_matches(|c: char| !c.is_alphanumeric());
                    let next_upper = next_clean
                        .chars()
                        .next()
                        .map(|c| c.is_uppercase())
                        .unwrap_or(false);

                    let is_suffix = ORG_SUFFIX.contains(&&*next_clean.to_lowercase());

                    if (ends_with_closing || ends_with_punct) && !is_suffix {
                        false
                    } else {
                        next_upper
                    }
                } else {
                    false
                };

                if first_char_upper || (is_connector && next_word_ok) {
                    i += 1;
                    if ends_with_closing || ends_with_punct {
                        // Sentence-final punctuation normally ends the span;
                        // continue only when the next word is an org suffix.
                        let is_suffix_next = if let Some(next_w) = words.get(i) {
                            let clean = next_w.to_lowercase();
                            let clean_ref = clean.trim_matches(|c: char| !c.is_alphanumeric());
                            ORG_SUFFIX.contains(&clean_ref)
                        } else {
                            false
                        };

                        if !is_suffix_next {
                            break;
                        }
                    }
                } else {
                    break;
                }
            }
            let end_idx = i;

            // Defensive guard: the inner loop always consumes the first word
            // (it already passed the same capitalization test above), so this
            // should be unreachable. NOTE(review): if it ever did fire, `i`
            // has not advanced and this `continue` would spin forever.
            if start_idx == end_idx {
                continue;
            }

            let span_words = &words[start_idx..end_idx];
            let mut entity_text = span_words.join(" ");

            // Check for an honorific directly before the span ("Dr. Smith").
            let prev_word = if start_idx > 0 {
                Some(
                    words[start_idx - 1]
                        .to_lowercase()
                        .trim_end_matches('.')
                        .to_string(),
                )
            } else {
                None
            };
            let should_include_prefix = prev_word
                .as_ref()
                .map(|p| PERSON_PREFIX.contains(&p.as_str()))
                .unwrap_or(false);

            if should_include_prefix {
                // Fold the honorific into the entity text and start the span
                // at the honorific's char offset instead.
                let prefix_word = &words[start_idx - 1];
                entity_text = format!("{} {}", prefix_word, entity_text);
                let prefix_char_start = words_with_pos[start_idx - 1].1;
                let char_start = prefix_char_start;
                let char_end = char_start + entity_text.chars().count();

                let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
                let (entity_type, confidence, reason) =
                    classify_minimal(&clean_span_words, &words, start_idx - 1);

                // Keep only confident, concretely-typed candidates.
                if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
                    entities.push(Entity::with_provenance(
                        entity_text,
                        entity_type,
                        char_start,
                        char_end,
                        confidence,
                        Provenance {
                            source: "heuristic".into(),
                            method: ExtractionMethod::Heuristic,
                            pattern: Some(reason.into()),
                            raw_confidence: Some(confidence),
                            model_version: None,
                            timestamp: None,
                        },
                    ));
                }
                continue;
            }

            // Trim punctuation from both ends of the joined span text.
            // NOTE(review): `leading_punct_len` is a BYTE count that is later
            // added to a CHAR offset; this is only correct while the stripped
            // punctuation is ASCII (curly quotes etc. would skew offsets) —
            // confirm whether non-ASCII leading punctuation can reach here.
            let leading_punct_len = entity_text.len()
                - entity_text
                    .trim_start_matches(|c: char| !c.is_alphanumeric())
                    .len();
            if leading_punct_len > 0 {
                entity_text = entity_text[leading_punct_len..].to_string();
            }

            // Pop trailing punctuation one char at a time.
            while entity_text.ends_with(|c: char| !c.is_alphanumeric()) {
                entity_text.pop();
            }

            if entity_text.is_empty() {
                continue;
            }

            let char_start = words_with_pos[start_idx].1 + leading_punct_len;
            // For ASCII, len() equals chars().count(); both arms agree.
            let char_end = char_start
                + if entity_text.is_ascii() {
                    entity_text.len()
                } else {
                    entity_text.chars().count()
                };

            let clean_span_words: Vec<&str> = entity_text.split_whitespace().collect();
            let (entity_type, confidence, reason) =
                classify_minimal(&clean_span_words, &words, start_idx);

            // Keep only confident, concretely-typed candidates.
            if confidence >= self.threshold && !matches!(entity_type, EntityType::Other(_)) {
                entities.push(Entity::with_provenance(
                    entity_text,
                    entity_type,
                    char_start,
                    char_end,
                    confidence,
                    Provenance {
                        source: "heuristic".into(),
                        method: ExtractionMethod::Heuristic,
                        pattern: Some(reason.into()),
                        raw_confidence: Some(confidence),
                        model_version: None,
                        timestamp: None,
                    },
                ));
            }
        }

        Ok(entities)
    }

    /// Entity types this model can emit.
    fn supported_types(&self) -> Vec<EntityType> {
        vec![
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
        ]
    }

    /// Always available: pure-Rust heuristics, no model files or services.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "heuristic"
    }

    fn description(&self) -> &'static str {
        "Heuristic NER optimized for low complexity"
    }

    /// Advertises batch and streaming support (the model is stateless).
    fn capabilities(&self) -> crate::ModelCapabilities {
        crate::ModelCapabilities {
            batch_capable: true,
            streaming_capable: true,
            ..Default::default()
        }
    }
}
656
/// True when `w` — after stripping surrounding punctuation — contains at
/// least two alphabetic characters and every alphabetic character is
/// uppercase: "NASA" and "(FBI)" qualify, "Nasa", "A", and "123" do not.
fn is_acronym_word(w: &str) -> bool {
    let core = w.trim_matches(|c: char| !c.is_alphanumeric());
    let mut letters = 0usize;
    for c in core.chars() {
        if c.is_alphabetic() {
            // Any lowercase (or caseless) letter disqualifies the word.
            if !c.is_uppercase() {
                return false;
            }
            letters += 1;
        }
    }
    letters >= 2
}
670
671fn classify_minimal(
672 span: &[&str],
673 all_words: &[&str],
674 start_idx: usize,
675) -> (EntityType, f64, &'static str) {
676 let last_word = span.last().map(|s| s.to_lowercase()).unwrap_or_default();
677 let first_word = span.first().map(|s| s.to_lowercase()).unwrap_or_default();
678 let span_lower = span
679 .iter()
680 .map(|s| s.to_lowercase())
681 .collect::<Vec<_>>()
682 .join(" ");
683
684 let prev_word = if start_idx > 0 {
686 Some(all_words[start_idx - 1].to_lowercase())
687 } else {
688 None
689 };
690
691 let skip_pronouns = [
693 "the", "a", "an", "he", "she", "it", "they", "we", "i", "you",
694 ];
695 if span.len() == 1 && skip_pronouns.contains(&first_word.as_str()) {
696 return (EntityType::Other("skip".into()), 0.0, "skip_pronoun");
697 }
698 let first_clean_lc = first_word
700 .trim_end_matches(|c: char| !c.is_alphanumeric())
701 .to_lowercase();
702 if span.len() == 1 && SKIP_WORDS.contains(&first_clean_lc.as_str()) {
703 return (EntityType::Other("skip".into()), 0.0, "skip_word");
704 }
705
706 let last_clean: &str = last_word.trim_end_matches(|c: char| !c.is_alphanumeric());
708 if ORG_SUFFIX.contains(&last_clean) {
709 return (EntityType::Organization, 0.85, "org_suffix");
710 }
711
712 let first_clean_text = first_word.trim_end_matches(|c: char| !c.is_alphanumeric());
714 if KNOWN_ORGS.contains(&first_clean_text) || KNOWN_ORGS.contains(&span_lower.as_str()) {
715 return (EntityType::Organization, 0.80, "known_org");
716 }
717
718 if KNOWN_LOCS.contains(&first_clean_text) || KNOWN_LOCS.contains(&span_lower.as_str()) {
720 return (EntityType::Location, 0.80, "known_location");
721 }
722
723 if KNOWN_PERSONS.contains(&first_clean_text) {
725 return (EntityType::Person, 0.75, "common_name");
726 }
727
728 if let Some(prev) = &prev_word {
730 let prev_clean: &str = prev.trim_end_matches('.');
731 if PERSON_PREFIX.contains(&prev_clean) {
732 return (EntityType::Person, 0.80, "person_prefix_context");
733 }
734 }
735
736 let first_clean: &str = first_word.trim_end_matches('.');
738 if PERSON_PREFIX.contains(&first_clean) && span.len() >= 2 {
739 return (EntityType::Person, 0.75, "person_prefix_span");
740 }
741
742 if span.len() >= 2 {
745 let has_real_acronym = span.iter().any(|w| {
746 is_acronym_word(w) && {
747 let lc = w.to_lowercase();
748 !SKIP_WORDS.contains(&lc.trim_matches(|c: char| !c.is_alphanumeric()))
749 }
750 });
751 if has_real_acronym {
752 return (EntityType::Organization, 0.70, "acronym_in_span");
753 }
754 }
755
756 if let Some(prev) = &prev_word {
758 if LOC_PREPOSITION.contains(&prev.as_str()) {
759 return (EntityType::Location, 0.70, "loc_context");
760 }
761 }
762
763 if span.len() == 2 {
766 let place_indicators = ["united", "new", "south", "north", "west", "east", "great"];
767 if place_indicators.contains(&first_word.as_str()) {
768 return (EntityType::Location, 0.65, "loc_indicator");
769 }
770 return (EntityType::Person, 0.60, "two_word_name");
771 }
772
773 if span.len() >= 3 {
775 if span.len() >= 2 && span[1].to_lowercase() == "of" {
777 return (EntityType::Organization, 0.65, "org_of_pattern");
778 }
779 return (EntityType::Organization, 0.50, "long_span_org");
780 }
781
782 if span.len() == 1 {
784 let word = span[0].trim_matches(|c: char| !c.is_alphanumeric());
785 if word.len() == 1 {
786 return (EntityType::Other("skip".into()), 0.0, "single_letter");
787 }
788 if is_acronym_word(word) {
789 let lc = word.to_lowercase();
790 if !SKIP_WORDS.contains(&lc.as_str()) {
791 return (EntityType::Organization, 0.55, "single_acronym");
792 }
793 }
794 }
795
796 if start_idx == 0 && prev_word.is_none() {
798 return (EntityType::Person, 0.30, "single_start_word");
799 }
800
801 (EntityType::Person, 0.45, "capitalized")
803}
804
// Marker impl: advertises that this model supports named-entity extraction.
impl crate::NamedEntityCapable for HeuristicNER {}
806
807impl crate::BatchCapable for HeuristicNER {
812 fn optimal_batch_size(&self) -> Option<usize> {
813 Some(16) }
815}
816
817impl crate::StreamingCapable for HeuristicNER {
822 fn recommended_chunk_size(&self) -> usize {
823 8192 }
825}
826
827#[cfg(test)]
828mod tests;