1use crate::offset::TextSpan;
31use crate::{Entity, EntityType, Model, Result};
32use once_cell::sync::Lazy;
33use regex::Regex;
34
/// Deprecated rule-based named-entity recognizer driven by regex gazetteers
/// and capitalization heuristics (see the `Model` impl for the pipeline).
#[deprecated(
    since = "0.1.0",
    note = "Use RegexNER (no gazetteers) or ML backends (BERT ONNX). Will be removed in 1.0."
)]
pub struct RuleBasedNER {
    // Entities whose fixed pattern confidence falls below this threshold are
    // dropped at the end of `extract_entities`.
    min_confidence: f64,
    // When true, candidate matches that are common capitalized English words
    // or known noise phrases are suppressed.
    filter_common: bool,
}
49
50#[allow(deprecated)]
51impl RuleBasedNER {
52 pub fn new() -> Self {
54 Self {
55 min_confidence: 0.3, filter_common: true,
57 }
58 }
59
60 #[must_use]
62 pub fn with_min_confidence(min_confidence: f64) -> Self {
63 Self {
64 min_confidence,
65 filter_common: true,
66 }
67 }
68
69 #[allow(dead_code)]
71 pub fn without_filtering() -> Self {
72 Self {
73 min_confidence: 0.3,
74 filter_common: false,
75 }
76 }
77}
78
#[allow(deprecated)]
impl Default for RuleBasedNER {
    /// Equivalent to [`RuleBasedNER::new`]: default threshold with
    /// common-word filtering enabled.
    fn default() -> Self {
        Self::new()
    }
}
85
#[allow(deprecated)]
impl Model for RuleBasedNER {
    /// Extracts entities from `text` via a fixed cascade of regex passes.
    ///
    /// Passes run in priority order — known organizations (0.95), org-suffix
    /// phrases (0.85), locations (0.9), person names (0.7), generic
    /// capitalized spans (0.4), then dates/money/percentages (0.8). Each
    /// later pass skips any match overlapping an entity already collected,
    /// so earlier, higher-priority patterns win. Entities scoring below
    /// `self.min_confidence` are removed at the end.
    ///
    /// `_language` is ignored: all patterns are English-only.
    fn extract_entities(&self, text: &str, _language: Option<&str>) -> Result<Vec<Entity>> {
        let mut entities = Vec::new();

        // Pass 1: gazetteer of well-known organization/agency/league acronyms
        // and names (includes English county cricket sides in upper case).
        static KNOWN_ORGS: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b(?:NASA|FBI|CIA|NSA|NIH|FDA|CDC|EPA|WHO|NATO|UN|EU|IMF|WTO|CERN|MIT|UCLA|DARPA|OECD|OPEC|IEEE|ACM|AWS|GCP|IBM|HP|AMD|ARM|NVIDIA|Intel|Apple|Google|Microsoft|Amazon|Meta|OpenAI|Anthropic|DeepMind|Pfizer|Moderna|Rivian|BYD|Netflix|Uber|Airbnb|NeurIPS|ICML|ICLR|CVPR|ACL|EMNLP|NAACL|IPCC|SEC|FCC|DOJ|DOE|DOD|USDA|HUD|IRS|FEMA|OSHA|NOAA|NSF|USPTO|FTC|NIST|DOT|VA|SSA|SBA|FAA|TSA|ICE|CBP|USCIS|NFL|NBA|MLB|NHL|MLS|FIFA|UEFA|IOC|NCAA|PGA|ATP|WTA|UFC|WWE|ESPN|LEICESTERSHIRE|DERBYSHIRE|YORKSHIRE|SURREY|ESSEX|WARWICKSHIRE|SUSSEX|MIDDLESEX|HAMPSHIRE|SOMERSET|KENT|LANCASHIRE|GLOUCESTERSHIRE|NOTTINGHAMSHIRE|NORTHAMPTONSHIRE|WORCESTERSHIRE|DURHAM)\b")
                .expect("Failed to compile known orgs pattern")
        });

        for cap in KNOWN_ORGS.find_iter(text) {
            // TextSpan::from_bytes converts the regex's byte offsets into the
            // char-based offsets exposed as `char_start`/`char_end`.
            let span = TextSpan::from_bytes(text, cap.start(), cap.end());
            entities.push(Entity::new(
                cap.as_str(),
                EntityType::Organization,
                span.char_start,
                span.char_end,
                0.95,
            ));
        }

        // Pass 2: capitalized phrase ending in a corporate/institutional
        // suffix ("... Inc", "... University", "... Museum", etc.).
        static ORG_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b[A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*\s+(?:Inc\.?|Corp\.?|Corporation|Ltd\.?|LLC|GmbH|University|Institute|Foundation|Laboratory|Labs?|Company|Technologies|Systems|Research|Group|Partners|Associates|Agency|Commission|Court|Council|Board|Committee|Organization|Organisation|Bank|Reserve|Museum)\b")
                .expect("Failed to compile org pattern")
        });

        for cap in ORG_PATTERN.find_iter(text) {
            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            // Skip anything already claimed by an earlier (higher-priority) pass.
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            // Drop a leading "The "/"A "/"An " and shift the match start by
            // the number of bytes removed before recomputing the char span.
            let text_str = strip_leading_article(cap.as_str());
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                EntityType::Organization,
                span.char_start,
                span.char_end,
                0.85,
            ));
        }

        // Pass 3: gazetteer of countries, cities, regions, and geographic
        // features; multi-word names are listed before their prefixes so the
        // alternation prefers the longer match.
        static LOCATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b(?:New\s+York(?:\s+City)?|San\s+Francisco|Los\s+Angeles|Washington(?:\s+D\.?C\.?)?|Tokyo\s+Bay|United\s+States|United\s+Kingdom|European\s+Union|Asia-Pacific|North\s+America|South\s+America|Atlantic\s+Ocean|Pacific\s+Ocean|Amazon\s+River|Tokyo|Berlin|Paris|London|Beijing|Shanghai|Mumbai|Sydney|Moscow|Dubai|Seoul|Singapore|Hong\s+Kong|Brazil|Peru|Colombia|China|Japan|Germany|France|Italy|Spain|Canada|Australia|India|Russia|Mexico|Argentina|Chile|Ukraine|California|Texas|Florida|Illinois|Seattle|Chicago|Boston|Atlanta|Denver|Phoenix|Portland|Miami|Cupertino|Redmond|Wuhan|Geneva)\b")
                .expect("Failed to compile location pattern")
        });

        for cap in LOCATION_PATTERN.find_iter(text) {
            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            let text_str = strip_leading_article(cap.as_str());
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                EntityType::Location,
                span.char_start,
                span.char_end,
                0.9,
            ));
        }

        // Pass 4: person names — honorific/title + capitalized name,
        // "Surname et al.", or a bare 2-3 word capitalized sequence.
        static PERSON_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"(?:Dr\.|Mr\.|Mrs\.|Ms\.|Prof\.|Chairman|CEO|President|Director|Justice|General|Commissioner|Coach|Governor|Senator|Mayor)\s+[A-Z][a-z]+(?:\s+[a-z]+\s+[A-Z][a-z]+|\s+[A-Z][a-z]+)?|[A-Z][a-z]+\s+(?:et\s+al\.?)|[A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?")
                .expect("Failed to compile person pattern")
        });

        for cap in PERSON_PATTERN.find_iter(text) {
            let mut text_str = cap.as_str();
            text_str = strip_leading_article(text_str);

            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            // Suppress common sentence-openers and known noise phrases
            // unless filtering was explicitly disabled.
            if self.filter_common
                && (is_common_capitalized_word(text_str) || starts_with_noise(text_str))
            {
                continue;
            }
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                EntityType::Person,
                span.char_start,
                span.char_end,
                0.7,
            ));
        }

        // Pass 5: catch-all for remaining capitalized word runs; the type is
        // guessed by `infer_entity_type` and confidence is lowest (0.4), so a
        // raised `min_confidence` filters these first.
        static CAPITALIZED_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b")
                .expect("Failed to compile capitalized pattern")
        });

        for cap in CAPITALIZED_PATTERN.find_iter(text) {
            let mut text_str = cap.as_str();
            text_str = strip_leading_article(text_str);
            // A match that was only an article ("The") strips to nothing.
            if text_str.is_empty() {
                continue;
            }
            if self.filter_common
                && (is_common_capitalized_word(text_str) || starts_with_noise(text_str))
            {
                continue;
            }
            let cap_span = TextSpan::from_bytes(text, cap.start(), cap.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, cap_span.char_start, cap_span.char_end))
            {
                continue;
            }
            let entity_type = infer_entity_type(text_str);
            let start_adj = cap.start() + (cap.as_str().len() - text_str.len());
            let span = TextSpan::from_bytes(text, start_adj, cap.end());
            entities.push(Entity::new(
                text_str,
                entity_type,
                span.char_start,
                span.char_end,
                0.4,
            ));
        }

        // Pass 6: dates — ISO (YYYY-MM-DD), slash dates, "Month D[, YYYY]",
        // "D Month [YYYY]", quarters (Q1-Q4), and bare years 1900-2099.
        static DATE_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\b(?:\d{4}-\d{2}-\d{2}|\d{1,2}/\d{1,2}/\d{4}|(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:,\s*\d{4})?|\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)(?:\s+\d{4})?|(?:Q[1-4]|(?:19|20)\d{2}))\b")
                .expect("Failed to compile date pattern")
        });

        for date_match in DATE_PATTERN.find_iter(text) {
            let span = TextSpan::from_bytes(text, date_match.start(), date_match.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, span.char_start, span.char_end))
            {
                continue;
            }
            entities.push(Entity::new(
                date_match.as_str(),
                EntityType::Date,
                span.char_start,
                span.char_end,
                0.8,
            ));
        }

        // Pass 7: money — "$"-prefixed amounts with optional magnitude
        // suffix, or a number followed by a currency/magnitude word.
        static MONEY_PATTERN: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"\$[\d,]+\.?\d*\s*(?:billion|million|thousand|B|M|K)?|\d+\.?\d*\s*(?:dollars?|USD|EUR|GBP|billion|million)")
                .expect("Failed to compile money pattern")
        });

        for money_match in MONEY_PATTERN.find_iter(text) {
            let span = TextSpan::from_bytes(text, money_match.start(), money_match.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, span.char_start, span.char_end))
            {
                continue;
            }
            entities.push(Entity::new(
                money_match.as_str(),
                EntityType::Money,
                span.char_start,
                span.char_end,
                0.8,
            ));
        }

        // Pass 8: percentages, e.g. "15.5%" or "20 %".
        static PERCENT_PATTERN: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"\d+\.?\d*\s*%").expect("Failed to compile percent pattern"));

        for percent_match in PERCENT_PATTERN.find_iter(text) {
            let span = TextSpan::from_bytes(text, percent_match.start(), percent_match.end());
            if entities
                .iter()
                .any(|e| spans_overlap(e.start, e.end, span.char_start, span.char_end))
            {
                continue;
            }
            entities.push(Entity::new(
                percent_match.as_str(),
                EntityType::Percent,
                span.char_start,
                span.char_end,
                0.8,
            ));
        }

        // Final filters: enforce the configured confidence floor, then drop
        // any entity still carrying a leading "The " (defense-in-depth —
        // most passes already strip leading articles).
        entities.retain(|e| e.confidence >= self.min_confidence);

        entities.retain(|e| !e.text.starts_with("The "));

        Ok(entities)
    }

    /// All entity types this backend can emit, including the catch-all
    /// `Other("unknown")` produced by the capitalized-span pass.
    fn supported_types(&self) -> Vec<EntityType> {
        vec![
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
            EntityType::Date,
            EntityType::Money,
            EntityType::Percent,
            EntityType::Other("unknown".to_string()),
        ]
    }

    /// Always available: no model files or network access required.
    fn is_available(&self) -> bool {
        true
    }

    fn name(&self) -> &'static str {
        "rule"
    }

    fn description(&self) -> &'static str {
        "Rule-based NER using regex patterns and heuristics"
    }
}
351
/// True when the half-open spans `[s1_start, s1_end)` and
/// `[s2_start, s2_end)` share at least one position.
fn spans_overlap(s1_start: usize, s1_end: usize, s2_start: usize, s2_end: usize) -> bool {
    // Overlap iff each span starts before the other one ends.
    s1_start < s2_end && s2_start < s1_end
}
356
/// Removes a single leading English article ("The ", "A ", "An ") from
/// `text`, returning the remainder as a slice of the original string.
fn strip_leading_article(text: &str) -> &str {
    for article in ["The ", "A ", "An "] {
        if let Some(rest) = text.strip_prefix(article) {
            return rest;
        }
    }
    text
}
364
/// True when `text` begins with a fragment that usually opens a sentence or
/// a well-known paper title rather than naming a real entity.
fn starts_with_noise(text: &str) -> bool {
    const NOISE_STARTS: [&str; 7] = [
        "According",
        "Based",
        "Given",
        "Following",
        "Regarding",
        "Attention Is",
        "All You",
    ];
    NOISE_STARTS.iter().any(|prefix| text.starts_with(prefix))
}
379
380fn infer_entity_type(text: &str) -> EntityType {
384 let lower = text.to_lowercase();
385 let words: Vec<&str> = text.split_whitespace().collect();
386
387 if words.len() == 2 || words.len() == 3 {
389 if words
391 .iter()
392 .all(|w| w.chars().next().map(|c| c.is_uppercase()).unwrap_or(false))
393 {
394 if is_common_surname(words[0])
396 || (words.len() > 1 && words.last().is_some_and(|w| is_common_surname(w)))
397 {
398 return EntityType::Person;
399 }
400 }
401 }
402
403 if words.len() == 1 && is_common_surname(text) {
405 return EntityType::Person;
406 }
407
408 if lower.contains("network")
410 || lower.contains("model")
411 || lower.contains("algorithm")
412 || lower.contains("learning")
413 || lower.contains("neural")
414 || lower.contains("transformer")
415 {
416 return EntityType::Other("concept".to_string());
417 }
418
419 if text.len() >= 2 && text.len() <= 5 && text.chars().all(|c| c.is_uppercase()) {
421 return EntityType::Other("acronym".to_string());
422 }
423
424 EntityType::Other("unknown".to_string())
425}
426
/// Returns true if `word` matches a small gazetteer of frequent surnames
/// (Chinese, Korean, Japanese, and Anglophone), used as a weak signal that a
/// capitalized span is a person name.
fn is_common_surname(word: &str) -> bool {
    use std::collections::HashSet;
    use std::sync::OnceLock;

    // Built once on first use; HashSet lookup is O(1) per call instead of the
    // previous linear scan, matching `is_common_capitalized_word`'s approach.
    static COMMON_SURNAMES: OnceLock<HashSet<&'static str>> = OnceLock::new();
    let surnames = COMMON_SURNAMES.get_or_init(|| {
        [
            // Chinese.
            "Wang", "Li", "Zhang", "Liu", "Chen", "Yang", "Huang", "Zhao", "Wu", "Zhou",
            "Xu", "Sun", "Ma", "Zhu", "Hu", "Guo", "Lin", "He", "Gao", "Luo", "Zheng",
            "Liang", "Xie", "Tang", "Han", "Feng", "Deng", "Cao", "Peng", "Xiao", "Jiang",
            "Cheng", "Yuan", "Lu", "Pan", "Ding", "Wei", "Ren", "Shao", "Qian",
            // Korean.
            "Kim", "Lee", "Park", "Choi", "Jung", "Kang", "Cho", "Yoon", "Jang", "Lim",
            // Japanese.
            "Tanaka", "Suzuki", "Yamamoto", "Watanabe", "Sato", "Ito", "Nakamura",
            // Anglophone.
            "Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Wilson",
            "Moore", "Taylor", "Anderson", "Thomas", "White", "Harris",
        ]
        .into_iter()
        .collect()
    });
    surnames.contains(word)
}
444
/// Returns true if `word` is a common capitalized English word — determiner,
/// pronoun, preposition, auxiliary verb, academic boilerplate, scholarly
/// abbreviation, or generic technical term — that should not be surfaced as
/// an entity.
///
/// Fixes: the stop word "Concerning" was previously misspelled
/// "Concernoing" and therefore never matched; the duplicate "No" entry is
/// removed (the set made it redundant anyway).
fn is_common_capitalized_word(word: &str) -> bool {
    use std::collections::HashSet;
    use std::sync::OnceLock;

    // Built once on first use; lookups are O(1) thereafter.
    static COMMON_WORDS: OnceLock<HashSet<&'static str>> = OnceLock::new();

    let common_words = COMMON_WORDS.get_or_init(|| {
        [
            // Determiners and pronouns.
            "The", "A", "An", "This", "That", "These", "Those", "I", "You", "He", "She",
            "It", "We", "They", "My", "Your", "His", "Her", "Its", "Our", "Their", "What",
            "Which", "Who", "Whom",
            // Conjunctions and prepositions.
            "And", "Or", "But", "If", "When", "Where", "Why", "How", "As", "At", "By",
            "For", "From", "In", "Into", "Of", "On", "To", "With", "About", "After",
            "Against", "Before", "Between", "During", "Through", "Under", "Over", "Above",
            "Below", "Since", "Until", "Upon",
            // Auxiliary and common verbs.
            "Is", "Are", "Was", "Were", "Be", "Been", "Being", "Have", "Has", "Had",
            "Do", "Does", "Did", "Will", "Would", "Could", "Should", "May", "Might",
            "Can", "Cannot", "Let", "Get", "Got", "Make", "Made", "Take", "Took", "Give",
            "Gave", "See", "Saw", "Know", "Knew", "Think", "Thought", "Want", "Use",
            "Used", "Using", "Find",
            // Document-structure vocabulary.
            "Figure", "Table", "Section", "Chapter", "Page", "Abstract", "Introduction",
            "Conclusion", "Conclusions", "Discussion", "Method", "Methods", "Results",
            "References", "Appendix", "Acknowledgments", "Background", "Related", "Work",
            "Paper", "Papers", "Study", "Studies", "Research", "Analysis", "Data",
            "Model", "Models", "Approach", "Problem", "Solution", "System", "Systems",
            "Algorithm", "Algorithms", "Experiment", "Experiments", "Evaluation",
            "Performance", "Application", "Applications",
            // Sentence adverbs and connectives.
            "However", "Therefore", "Furthermore", "Moreover", "Although", "Thus",
            "Hence", "Similarly", "Additionally", "Nevertheless", "Consequently",
            "Specifically", "Generally", "Particularly", "Especially", "Indeed",
            "Actually", "Obviously", "Clearly", "Certainly", "Probably", "Possibly",
            "Perhaps", "Rather", "Instead", "Otherwise", "Finally", "Initially",
            "Ultimately", "Essentially", "Basically",
            // Mathematical and expository labels.
            "Note", "Notes", "Example", "Examples", "Definition", "Theorem", "Proof",
            "Lemma", "Proposition", "Corollary", "Remark", "Case", "Cases", "Step",
            "Steps", "Part", "Parts", "Item", "Items", "Point", "Points", "Fact",
            "Facts", "First", "Second", "Third", "Fourth", "Fifth", "Next", "Previous",
            "Following", "Preceding", "Here", "There", "Now", "Then", "Today",
            "Yesterday", "Tomorrow",
            // Interjections and short particles.
            "So", "No", "Yes", "Ok", "Oh", "Ah", "Eh", "Um", "Uh", "Re", "Vs", "Et",
            "Al",
            // Sentence-opening participles and quantifiers.
            "Based", "According", "Regarding", "Concerning", "Given", "Assuming",
            "Suppose", "Consider", "Considering", "Such", "Many", "Much", "Most",
            "Some", "Any", "Each", "Every", "Both", "All", "Other", "Another", "Same",
            "Different", "Various", "Several",
            // Publication-metadata vocabulary.
            "Published", "Received", "Accepted", "Revised", "Available", "Online",
            "Copyright", "Rights", "Reserved", "Author", "Authors", "Corresponding",
            "Email", "Address", "University", "Department", "Institute", "Center",
            "College", "School", "Lab",
            // Scholarly abbreviations ("No" is already listed above).
            "Fig", "Eq", "Eqs", "Ref", "Refs", "Tab", "Sec", "App", "Vol", "Pp", "Ed",
            "Eds", "Inc", "Ltd", "Corp", "Co", "Jr", "Sr", "Dr", "Mr", "Mrs", "Ms",
            "Prof",
            // Generic adjectives.
            "More", "Less", "Few", "Little", "New", "Old", "Good", "Bad", "Large",
            "Small", "High", "Low", "Long", "Short", "Full", "Empty", "True", "False",
            "Real", "Main",
            // Generic technical vocabulary.
            "Input", "Output", "Function", "Variable", "Parameter", "Value", "Type",
            "Class", "Object", "Array", "List", "Set", "Map", "Key", "Node", "Edge",
            "Graph", "Tree", "Network", "Layer", "Hidden", "Embedding", "Vector",
            "Matrix", "Tensor", "Loss", "Error", "Accuracy", "Score", "Rate", "Ratio",
            "Mean", "Average", "Sum", "Total", "Max", "Min", "Like", "Net", "Core",
            "Base", "Top", "Bottom", "Left", "Right",
        ]
        .into_iter()
        .collect()
    });

    common_words.contains(word)
}
834
#[cfg(test)]
#[allow(deprecated)]
mod tests {
    use super::*;

    // End-to-end smoke test: a single sentence should yield a person, a
    // money amount, and a date.
    #[test]
    fn test_rule_based_ner() {
        let ner = RuleBasedNER::new();
        let text =
            "John Smith works at Acme Corp. He earns $100,000 per year. The meeting is on 2024-01-15.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert!(!entities.is_empty());
        assert!(entities.iter().any(|e| e.text == "John Smith"));
        assert!(entities.iter().any(|e| e.entity_type == EntityType::Money));
        assert!(entities.iter().any(|e| e.entity_type == EntityType::Date));
    }

    // With filtering on (the default), common capitalized English words and
    // document-structure terms must not surface as entities.
    #[test]
    fn test_common_word_filtering() {
        let ner = RuleBasedNER::new();
        let text = "The Figure shows the Results. However, the Introduction was clear.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert!(!entities.iter().any(|e| e.text == "The"));
        assert!(!entities.iter().any(|e| e.text == "Figure"));
        assert!(!entities.iter().any(|e| e.text == "Results"));
        assert!(!entities.iter().any(|e| e.text == "However"));
        assert!(!entities.iter().any(|e| e.text == "Introduction"));
    }

    // With filtering disabled, even stop words like "The" and "Figure" are
    // emitted by the catch-all capitalized pass.
    #[test]
    fn test_without_filtering() {
        let ner = RuleBasedNER::without_filtering();
        let text = "The cat sat on Figure today.";
        let entities = ner.extract_entities(text, None).unwrap();

        assert!(
            entities.iter().any(|e| e.text == "The"),
            "Expected 'The' in entities: {:?}",
            entities
        );
        assert!(
            entities.iter().any(|e| e.text == "Figure"),
            "Expected 'Figure' in entities: {:?}",
            entities
        );
    }

    // Both "15.5%" and "20%" should be found by the percent pass.
    #[test]
    fn test_percentage_extraction() {
        let ner = RuleBasedNER::new();
        let text = "Accuracy improved by 15.5% and recall by 20%.";
        let entities = ner.extract_entities(text, None).unwrap();

        let percents: Vec<_> = entities
            .iter()
            .filter(|e| e.entity_type == EntityType::Percent)
            .collect();
        assert_eq!(percents.len(), 2);
    }

    // Basic Model-trait metadata: always available, named "rule",
    // non-empty supported-type list.
    #[test]
    fn test_model_interface() {
        let ner = RuleBasedNER::new();
        assert!(ner.is_available());
        assert_eq!(ner.name(), "rule");
        assert!(!ner.supported_types().is_empty());
    }
}