1use crate::offset::TextSpan;
31use crate::{Entity, EntityType, Model, Result};
32use once_cell::sync::Lazy;
33use regex::Regex;
34
/// Deprecated rule-based named-entity recognizer driven by regex gazetteers
/// and capitalization heuristics (see the `Model` impl for the pipeline).
#[deprecated(
    since = "0.1.0",
    note = "Use RegexNER (no gazetteers) or ML backends (BERT ONNX). Will be removed in 1.0."
)]
pub struct RuleBasedNER {
    // Entities whose fixed pattern confidence falls below this threshold are
    // dropped at the end of `extract_entities`.
    min_confidence: f64,
    // When true, candidate matches that are common capitalized English words
    // or known noise phrases are suppressed.
    filter_common: bool,
}
49
50#[allow(deprecated)]
51impl RuleBasedNER {
52 pub fn new() -> Self {
54 Self {
55 min_confidence: 0.3, filter_common: true,
57 }
58 }
59
60 #[must_use]
62 pub fn with_min_confidence(min_confidence: f64) -> Self {
63 Self {
64 min_confidence,
65 filter_common: true,
66 }
67 }
68
69 #[allow(dead_code)]
71 pub fn without_filtering() -> Self {
72 Self {
73 min_confidence: 0.3,
74 filter_common: false,
75 }
76 }
77}
78
#[allow(deprecated)]
impl Default for RuleBasedNER {
    /// Equivalent to [`RuleBasedNER::new`]: default threshold with
    /// common-word filtering enabled.
    fn default() -> Self {
        Self::new()
    }
}
85
#[allow(deprecated)]
impl Model for RuleBasedNER {
    /// Extracts entities from `text` via a fixed cascade of regex passes.
    ///
    /// Passes run in priority order — known organizations (0.95), org-suffix
    /// phrases (0.85), locations (0.9), person names (0.7), generic
    /// capitalized spans (0.4), then dates/money/percentages (0.8). Each
    /// later pass skips any match overlapping an entity already collected,
    /// so earlier, higher-priority patterns win. Entities scoring below
    /// `self.min_confidence` are removed at the end.
    ///
    /// `_language` is ignored: all patterns are English-only.
    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        // Pass 1: gazetteer of well-known organization/agency/league acronyms
        // and names (includes English county cricket sides in upper case).
        static KNOWN_ORGS: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b(?:NASA|FBI|CIA|NSA|NIH|FDA|CDC|EPA|WHO|NATO|UN|EU|IMF|WTO|CERN|MIT|UCLA|DARPA|OECD|OPEC|IEEE|ACM|AWS|GCP|IBM|HP|AMD|ARM|NVIDIA|Intel|Apple|Google|Microsoft|Amazon|Meta|OpenAI|Anthropic|DeepMind|Pfizer|Moderna|Rivian|BYD|Netflix|Uber|Airbnb|NeurIPS|ICML|ICLR|CVPR|ACL|EMNLP|NAACL|IPCC|SEC|FCC|DOJ|DOE|DOD|USDA|HUD|IRS|FEMA|OSHA|NOAA|NSF|USPTO|FTC|NIST|DOT|VA|SSA|SBA|FAA|TSA|ICE|CBP|USCIS|NFL|NBA|MLB|NHL|MLS|FIFA|UEFA|IOC|NCAA|PGA|ATP|WTA|UFC|WWE|ESPN|LEICESTERSHIRE|DERBYSHIRE|YORKSHIRE|SURREY|ESSEX|WARWICKSHIRE|SUSSEX|MIDDLESEX|HAMPSHIRE|SOMERSET|KENT|LANCASHIRE|GLOUCESTERSHIRE|NOTTINGHAMSHIRE|NORTHAMPTONSHIRE|WORCESTERSHIRE|DURHAM)\b")
                .expect("Failed to compile known orgs pattern")
        });

        for cap in KNOWN_ORGS.find_iter(text) {
            // TextSpan::from_bytes converts the regex's byte offsets into the
            // char-based offsets exposed as `char_start`/`char_end`.
            let span = TextSpan::from_bytes(text, cap.start(), cap.end());
            entities.push(Entity::new(
                cap.as_str(),
                EntityType::Organization,
                span.char_start,
                span.char_end,
                0.95,
            ));
        }

        // Pass 2: capitalized phrase ending in a corporate/institutional
        // suffix ("... Inc", "... University", "... Museum", etc.).
        static ORG_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b[A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*\s+(?:Inc\.?|Corp\.?|Corporation|Ltd\.?|LLC|GmbH|University|Institute|Foundation|Laboratory|Labs?|Company|Technologies|Systems|Research|Group|Partners|Associates|Agency|Commission|Court|Council|Board|Committee|Organization|Organisation|Bank|Reserve|Museum)\b")
                .expect("Failed to compile org pattern")
        });

        for cap in ORG_PATTERN.find_iter(text) {
            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            // Skip anything already claimed by an earlier (higher-priority) pass.
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            // Drop a leading "The "/"A "/"An " and shift the match start by
            // the number of bytes removed before recomputing the char span.
            let text_str = strip_leading_article(cap.as_str());
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                EntityType::Organization,
                span.char_start,
                span.char_end,
                0.85,
            ));
        }

        // Pass 3: gazetteer of countries, cities, regions, and geographic
        // features; multi-word names are listed before their prefixes so the
        // alternation prefers the longer match.
        static LOCATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b(?:New\s+York(?:\s+City)?|San\s+Francisco|Los\s+Angeles|Washington(?:\s+D\.?C\.?)?|Tokyo\s+Bay|United\s+States|United\s+Kingdom|European\s+Union|Asia-Pacific|North\s+America|South\s+America|Atlantic\s+Ocean|Pacific\s+Ocean|Amazon\s+River|Tokyo|Berlin|Paris|London|Beijing|Shanghai|Mumbai|Sydney|Moscow|Dubai|Seoul|Singapore|Hong\s+Kong|Brazil|Peru|Colombia|China|Japan|Germany|France|Italy|Spain|Canada|Australia|India|Russia|Mexico|Argentina|Chile|Ukraine|California|Texas|Florida|Illinois|Seattle|Chicago|Boston|Atlanta|Denver|Phoenix|Portland|Miami|Cupertino|Redmond|Wuhan|Geneva)\b")
                .expect("Failed to compile location pattern")
        });

        for cap in LOCATION_PATTERN.find_iter(text) {
            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            let text_str = strip_leading_article(cap.as_str());
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                EntityType::Location,
                span.char_start,
                span.char_end,
                0.9,
            ));
        }

        // Pass 4: person names — honorific/title + capitalized name,
        // "Surname et al.", or a bare 2-3 word capitalized sequence.
        static PERSON_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"(?:Dr\.|Mr\.|Mrs\.|Ms\.|Prof\.|Chairman|CEO|President|Director|Justice|General|Commissioner|Coach|Governor|Senator|Mayor)\s+[A-Z][a-z]+(?:\s+[a-z]+\s+[A-Z][a-z]+|\s+[A-Z][a-z]+)?|[A-Z][a-z]+\s+(?:et\s+al\.?)|[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?")
                .expect("Failed to compile person pattern")
        });

        for cap in PERSON_PATTERN.find_iter(text) {
            let mut text_str = cap.as_str();
            text_str = strip_leading_article(text_str);

            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            // Suppress common sentence-openers and known noise phrases
            // unless filtering was explicitly disabled.
            if self.filter_common
                && (is_common_capitalized_word(text_str) || starts_with_noise(text_str))
            {
                continue;
            }
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                EntityType::Person,
                span.char_start,
                span.char_end,
                0.7,
            ));
        }

        // Pass 5: catch-all for remaining capitalized word runs; the type is
        // guessed by `infer_entity_type` and confidence is lowest (0.4), so a
        // raised `min_confidence` filters these first.
        static CAPITALIZED_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b")
                .expect("Failed to compile capitalized pattern")
        });

        for cap in CAPITALIZED_PATTERN.find_iter(text) {
            let mut text_str = cap.as_str();
            text_str = strip_leading_article(text_str);
            // A match that was only an article ("The") strips to nothing.
            if text_str.is_empty() {
                continue;
            }
            if self.filter_common
                && (is_common_capitalized_word(text_str) || starts_with_noise(text_str))
            {
                continue;
            }
            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            let entity_type = infer_entity_type(text_str);
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                entity_type,
                span.char_start,
                span.char_end,
                0.4,
            ));
        }

        // Pass 6: dates — ISO (YYYY-MM-DD), slash dates, "Month D[, YYYY]",
        // "D Month [YYYY]", quarters (Q1-Q4), and bare years 1900-2099.
        static DATE_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b(?:\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:,\s*\d{4})?|\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)(?:\s+\d{4})?|(?:Q[1-4]|(?:19|20)\d{2}))\b")
                .expect("Failed to compile date pattern")
        });

        for date_match in DATE_PATTERN.find_iter(text) {
            let span = TextSpan::from_bytes(text, date_match.start(), date_match.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, span.char_start, span.char_end))
            {
                continue;
            }
            entities.push(Entity::new(
                date_match.as_str(),
                EntityType::Date,
                span.char_start,
                span.char_end,
                0.8,
            ));
        }

        // Pass 7: money — "$"-prefixed amounts with optional magnitude
        // suffix, or a number followed by a currency/magnitude word.
        static MONEY_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\$[\d,]+\.?\d*\s*(?:billion|million|thousand|B|M|K)?|\d+\.?\d*\s*(?:dollars?|USD|EUR|GBP|billion|million)")
                .expect("Failed to compile money pattern")
        });

        for money_match in MONEY_PATTERN.find_iter(text) {
            let span = TextSpan::from_bytes(text, money_match.start(), money_match.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, span.char_start, span.char_end))
            {
                continue;
            }
            entities.push(Entity::new(
                money_match.as_str(),
                EntityType::Money,
                span.char_start,
                span.char_end,
                0.8,
            ));
        }

        // Pass 8: percentages, e.g. "15.5%" or "20 %".
        static PERCENT_PATTERN: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"\d+\.?\d*\s*%").expect("Failed to compile percent pattern"));

        for percent_match in PERCENT_PATTERN.find_iter(text) {
            let span = TextSpan::from_bytes(text, percent_match.start(), percent_match.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, span.char_start, span.char_end))
            {
                continue;
            }
            entities.push(Entity::new(
                percent_match.as_str(),
                EntityType::Percent,
                span.char_start,
                span.char_end,
                0.8,
            ));
        }

        // Final filters: enforce the configured confidence floor, then drop
        // any entity still carrying a leading "The " (defense-in-depth —
        // most passes already strip leading articles).
        entities.retain(|e| e.confidence >= self.min_confidence);

        entities.retain(|e| !e.text.starts_with("The "));

        Ok(entities)
    }

    /// All entity types this backend can emit, including the catch-all
    /// `Other("unknown")` produced by the capitalized-span pass.
    fn supported_types(&self) -> Vec<EntityType> {
        vec![
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
            EntityType::Date,
            EntityType::Money,
            EntityType::Percent,
            EntityType::Other("unknown".to_string()),
        ]
    }

    /// Always available: no model files or network access required.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "rule"
    }

    fn description(&self) -> &'static str {
        "Rule-based NER using regex patterns and heuristics"
    }
}
351
/// True when the half-open spans `[s1_start, s1_end)` and
/// `[s2_start, s2_end)` share at least one position.
fn spans_overlap(s1_start: usize, s1_end: usize, s2_start: usize, s2_end: usize) -> bool {
    // Overlap iff each span starts before the other one ends.
    s1_start < s2_end && s2_start < s1_end
}
356
/// Removes a single leading English article ("The ", "A ", "An ") from
/// `text`, returning the remainder as a slice of the original string.
fn strip_leading_article(text: &str) -> &str {
    for article in ["The ", "A ", "An "] {
        if let Some(rest) = text.strip_prefix(article) {
            return rest;
        }
    }
    text
}
364
/// True when `text` begins with a fragment that usually opens a sentence or
/// a well-known paper title rather than naming a real entity.
fn starts_with_noise(text: &str) -> bool {
    const NOISE_STARTS: [&str; 7] = [
        "According",
        "Based",
        "Given",
        "Following",
        "Regarding",
        "Attention Is",
        "All You",
    ];
    NOISE_STARTS.iter().any(|prefix| text.starts_with(prefix))
}
379
380fn infer_entity_type(text: &str) -> EntityType {
384 let lower = text.to_lowercase();
385 let words: Vec<&str> = text.split_whitespace().collect();
386
387 if words.len() == 2 || words.len() == 3 {
389 if words
391 .iter()
392 .all(|w| w.chars().next().map(|c| c.is_uppercase()).unwrap_or(false))
393 {
394 if is_common_surname(words[0])
396 || (words.len() > 1 && words.last().is_some_and(|w| is_common_surname(w)))
397 {
398 return EntityType::Person;
399 }
400 }
401 }
402
403 if words.len() == 1 && is_common_surname(text) {
405 return EntityType::Person;
406 }
407
408 if lower.contains("network")
410 || lower.contains("model")
411 || lower.contains("algorithm")
412 || lower.contains("learning")
413 || lower.contains("neural")
414 || lower.contains("transformer")
415 {
416 return EntityType::Other("concept".to_string());
417 }
418
419 if text.len() >= 2 && text.len() <= 5 && text.chars().all(|c| c.is_uppercase()) {
421 return EntityType::Other("acronym".to_string());
422 }
423
424 EntityType::Other("unknown".to_string())
425}
426
/// Returns true if `word` matches a small gazetteer of frequent surnames
/// (Chinese, Korean, Japanese, and Anglophone), used as a weak signal that a
/// capitalized span is a person name.
fn is_common_surname(word: &str) -> bool {
    use std::collections::HashSet;
    use std::sync::OnceLock;

    // Built once on first use; HashSet lookup is O(1) per call instead of the
    // previous linear scan, matching `is_common_capitalized_word`'s approach.
    static COMMON_SURNAMES: OnceLock<HashSet<&'static str>> = OnceLock::new();
    let surnames = COMMON_SURNAMES.get_or_init(|| {
        [
            // Chinese.
            "Wang", "Li", "Zhang", "Liu", "Chen", "Yang", "Huang", "Zhao", "Wu", "Zhou",
            "Xu", "Sun", "Ma", "Zhu", "Hu", "Guo", "Lin", "He", "Gao", "Luo", "Zheng",
            "Liang", "Xie", "Tang", "Han", "Feng", "Deng", "Cao", "Peng", "Xiao", "Jiang",
            "Cheng", "Yuan", "Lu", "Pan", "Ding", "Wei", "Ren", "Shao", "Qian",
            // Korean.
            "Kim", "Lee", "Park", "Choi", "Jung", "Kang", "Cho", "Yoon", "Jang", "Lim",
            // Japanese.
            "Tanaka", "Suzuki", "Yamamoto", "Watanabe", "Sato", "Ito", "Nakamura",
            // Anglophone.
            "Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Wilson",
            "Moore", "Taylor", "Anderson", "Thomas", "White", "Harris",
        ]
        .into_iter()
        .collect()
    });
    surnames.contains(word)
}
444
/// Returns true if `word` is a common capitalized English word — determiner,
/// pronoun, preposition, auxiliary verb, academic boilerplate, scholarly
/// abbreviation, or generic technical term — that should not be surfaced as
/// an entity.
///
/// Fixes: the stop word "Concerning" was previously misspelled
/// "Concernoing" and therefore never matched; the duplicate "No" entry is
/// removed (the set made it redundant anyway).
fn is_common_capitalized_word(word: &str) -> bool {
    use std::collections::HashSet;
    use std::sync::OnceLock;

    // Built once on first use; lookups are O(1) thereafter.
    static COMMON_WORDS: OnceLock<HashSet<&'static str>> = OnceLock::new();

    let common_words = COMMON_WORDS.get_or_init(|| {
        [
            // Determiners and pronouns.
            "The", "A", "An", "This", "That", "These", "Those", "I", "You", "He", "She",
            "It", "We", "They", "My", "Your", "His", "Her", "Its", "Our", "Their", "What",
            "Which", "Who", "Whom",
            // Conjunctions and prepositions.
            "And", "Or", "But", "If", "When", "Where", "Why", "How", "As", "At", "By",
            "For", "From", "In", "Into", "Of", "On", "To", "With", "About", "After",
            "Against", "Before", "Between", "During", "Through", "Under", "Over", "Above",
            "Below", "Since", "Until", "Upon",
            // Auxiliary and common verbs.
            "Is", "Are", "Was", "Were", "Be", "Been", "Being", "Have", "Has", "Had",
            "Do", "Does", "Did", "Will", "Would", "Could", "Should", "May", "Might",
            "Can", "Cannot", "Let", "Get", "Got", "Make", "Made", "Take", "Took", "Give",
            "Gave", "See", "Saw", "Know", "Knew", "Think", "Thought", "Want", "Use",
            "Used", "Using", "Find",
            // Document-structure vocabulary.
            "Figure", "Table", "Section", "Chapter", "Page", "Abstract", "Introduction",
            "Conclusion", "Conclusions", "Discussion", "Method", "Methods", "Results",
            "References", "Appendix", "Acknowledgments", "Background", "Related", "Work",
            "Paper", "Papers", "Study", "Studies", "Research", "Analysis", "Data",
            "Model", "Models", "Approach", "Problem", "Solution", "System", "Systems",
            "Algorithm", "Algorithms", "Experiment", "Experiments", "Evaluation",
            "Performance", "Application", "Applications",
            // Sentence adverbs and connectives.
            "However", "Therefore", "Furthermore", "Moreover", "Although", "Thus",
            "Hence", "Similarly", "Additionally", "Nevertheless", "Consequently",
            "Specifically", "Generally", "Particularly", "Especially", "Indeed",
            "Actually", "Obviously", "Clearly", "Certainly", "Probably", "Possibly",
            "Perhaps", "Rather", "Instead", "Otherwise", "Finally", "Initially",
            "Ultimately", "Essentially", "Basically",
            // Mathematical and expository labels.
            "Note", "Notes", "Example", "Examples", "Definition", "Theorem", "Proof",
            "Lemma", "Proposition", "Corollary", "Remark", "Case", "Cases", "Step",
            "Steps", "Part", "Parts", "Item", "Items", "Point", "Points", "Fact",
            "Facts", "First", "Second", "Third", "Fourth", "Fifth", "Next", "Previous",
            "Following", "Preceding", "Here", "There", "Now", "Then", "Today",
            "Yesterday", "Tomorrow",
            // Interjections and short particles.
            "So", "No", "Yes", "Ok", "Oh", "Ah", "Eh", "Um", "Uh", "Re", "Vs", "Et",
            "Al",
            // Sentence-opening participles and quantifiers.
            "Based", "According", "Regarding", "Concerning", "Given", "Assuming",
            "Suppose", "Consider", "Considering", "Such", "Many", "Much", "Most",
            "Some", "Any", "Each", "Every", "Both", "All", "Other", "Another", "Same",
            "Different", "Various", "Several",
            // Publication-metadata vocabulary.
            "Published", "Received", "Accepted", "Revised", "Available", "Online",
            "Copyright", "Rights", "Reserved", "Author", "Authors", "Corresponding",
            "Email", "Address", "University", "Department", "Institute", "Center",
            "College", "School", "Lab",
            // Scholarly abbreviations ("No" is already listed above).
            "Fig", "Eq", "Eqs", "Ref", "Refs", "Tab", "Sec", "App", "Vol", "Pp", "Ed",
            "Eds", "Inc", "Ltd", "Corp", "Co", "Jr", "Sr", "Dr", "Mr", "Mrs", "Ms",
            "Prof",
            // Generic adjectives.
            "More", "Less", "Few", "Little", "New", "Old", "Good", "Bad", "Large",
            "Small", "High", "Low", "Long", "Short", "Full", "Empty", "True", "False",
            "Real", "Main",
            // Generic technical vocabulary.
            "Input", "Output", "Function", "Variable", "Parameter", "Value", "Type",
            "Class", "Object", "Array", "List", "Set", "Map", "Key", "Node", "Edge",
            "Graph", "Tree", "Network", "Layer", "Hidden", "Embedding", "Vector",
            "Matrix", "Tensor", "Loss", "Error", "Accuracy", "Score", "Rate", "Ratio",
            "Mean", "Average", "Sum", "Total", "Max", "Min", "Like", "Net", "Core",
            "Base", "Top", "Bottom", "Left", "Right",
        ]
        .into_iter()
        .collect()
    });

    common_words.contains(word)
}
834
#[cfg(test)]
#[allow(deprecated)]
mod tests {
    use super::*;

    // End-to-end smoke test: a single sentence should yield a person, a
    // money amount, and a date.
    #[test]
    fn test_rule_based_ner() {
        let ner = RuleBasedNER::new();
        let text =
            "John Smith works at Acme Corp. He earns $100,000 per year. The meeting is on 2024-01-15.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert!(!entities.is_empty());
        assert!(entities.iter().any(|e| e.text == "John Smith"));
        assert!(entities.iter().any(|e| e.entity_type == EntityType::Money));
        assert!(entities.iter().any(|e| e.entity_type == EntityType::Date));
    }

    // With filtering on (the default), common capitalized English words and
    // document-structure terms must not surface as entities.
    #[test]
    fn test_common_word_filtering() {
        let ner = RuleBasedNER::new();
        let text = "The Figure shows the Results. However, the Introduction was clear.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert!(!entities.iter().any(|e| e.text == "The"));
        assert!(!entities.iter().any(|e| e.text == "Figure"));
        assert!(!entities.iter().any(|e| e.text == "Results"));
        assert!(!entities.iter().any(|e| e.text == "However"));
        assert!(!entities.iter().any(|e| e.text == "Introduction"));
    }

    // With filtering disabled, even stop words like "The" and "Figure" are
    // emitted by the catch-all capitalized pass.
    #[test]
    fn test_without_filtering() {
        let ner = RuleBasedNER::without_filtering();
        let text = "The cat sat on Figure today.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert!(
            entities.iter().any(|e| e.text == "The"),
            "Expected 'The' in entities: {:?}",
            entities
        );
        assert!(
            entities.iter().any(|e| e.text == "Figure"),
            "Expected 'Figure' in entities: {:?}",
            entities
        );
    }

    // Both "15.5%" and "20%" should be found by the percent pass.
    #[test]
    fn test_percentage_extraction() {
        let ner = RuleBasedNER::new();
        let text = "Accuracy improved by 15.5% and recall by 20%.";
        let entities = ner.extract_entities(text, None).unwrap();

        let percents: Vec<_> = entities
            .iter()
            .filter(|e| e.entity_type == EntityType::Percent)
            .collect();
        assert_eq!(percents.len(), 2);
    }

    // Basic Model-trait metadata: always available, named "rule",
    // non-empty supported-type list.
    #[test]
    fn test_model_interface() {
        let ner = RuleBasedNER::new();
        assert!(ner.is_available());
        assert_eq!(ner.name(), "rule");
        assert!(!ner.supported_types().is_empty());
    }
}