scirs2_text/information_extraction/
pipeline.rs1use super::confidence::ConfidenceScorer;
4use super::coreference::{CoreferenceChain, CoreferenceResolver};
5use super::entities::Entity;
6use super::extractors::{KeyPhraseExtractor, PatternExtractor, RuleBasedNER};
7use super::linking::{EntityLinker, LinkedEntity};
8use super::relations::{Relation, RelationExtractor};
9use super::temporal::TemporalExtractor;
10use crate::error::Result;
11use crate::tokenize::WordTokenizer;
12use std::collections::HashMap;
13
14#[derive(Debug)]
16pub struct ExtractedInformation {
17 pub entities: Vec<Entity>,
19 pub key_phrases: Vec<(String, f64)>,
21 pub patterns: HashMap<String, Vec<String>>,
23 pub relations: Vec<Relation>,
25}
26
27#[derive(Debug)]
29pub struct AdvancedExtractedInformation {
30 pub entities: Vec<Entity>,
32 pub linked_entities: Vec<LinkedEntity>,
34 pub key_phrases: Vec<(String, f64)>,
36 pub patterns: HashMap<String, Vec<String>>,
38 pub relations: Vec<Relation>,
40 pub coreference_chains: Vec<CoreferenceChain>,
42}
43
44pub struct InformationExtractionPipeline {
46 ner: RuleBasedNER,
47 key_phrase_extractor: KeyPhraseExtractor,
48 pattern_extractor: PatternExtractor,
49 relation_extractor: RelationExtractor,
50}
51
52impl Default for InformationExtractionPipeline {
53 fn default() -> Self {
54 Self::new()
55 }
56}
57
58impl InformationExtractionPipeline {
59 pub fn new() -> Self {
61 Self {
62 ner: RuleBasedNER::new(),
63 key_phrase_extractor: KeyPhraseExtractor::new(),
64 pattern_extractor: PatternExtractor::new(),
65 relation_extractor: RelationExtractor::new(),
66 }
67 }
68
69 pub fn with_ner(mut self, ner: RuleBasedNER) -> Self {
71 self.ner = ner;
72 self
73 }
74
75 pub fn with_key_phrase_extractor(mut self, extractor: KeyPhraseExtractor) -> Self {
77 self.key_phrase_extractor = extractor;
78 self
79 }
80
81 pub fn with_pattern_extractor(mut self, extractor: PatternExtractor) -> Self {
83 self.pattern_extractor = extractor;
84 self
85 }
86
87 pub fn with_relation_extractor(mut self, extractor: RelationExtractor) -> Self {
89 self.relation_extractor = extractor;
90 self
91 }
92
93 pub fn extract(&self, text: &str) -> Result<ExtractedInformation> {
95 let tokenizer = WordTokenizer::default();
96
97 let entities = self.ner.extract_entities(text)?;
98 let key_phrases = self.key_phrase_extractor.extract(text, &tokenizer)?;
99 let patterns = self.pattern_extractor.extract(text)?;
100 let relations = self.relation_extractor.extract_relations(text, &entities)?;
101
102 Ok(ExtractedInformation {
103 entities,
104 key_phrases,
105 patterns,
106 relations,
107 })
108 }
109}
110
111pub struct AdvancedExtractionPipeline {
113 ner: RuleBasedNER,
114 key_phrase_extractor: KeyPhraseExtractor,
115 pattern_extractor: PatternExtractor,
116 relation_extractor: RelationExtractor,
117 temporal_extractor: TemporalExtractor,
118 entity_linker: EntityLinker,
119 coreference_resolver: CoreferenceResolver,
120 confidence_scorer: ConfidenceScorer,
121}
122
123impl Default for AdvancedExtractionPipeline {
124 fn default() -> Self {
125 Self::new()
126 }
127}
128
129impl AdvancedExtractionPipeline {
130 pub fn new() -> Self {
132 Self {
133 ner: RuleBasedNER::new(),
134 key_phrase_extractor: KeyPhraseExtractor::new(),
135 pattern_extractor: PatternExtractor::new(),
136 relation_extractor: RelationExtractor::new(),
137 temporal_extractor: TemporalExtractor::new(),
138 entity_linker: EntityLinker::new(),
139 coreference_resolver: CoreferenceResolver::new(),
140 confidence_scorer: ConfidenceScorer::new(),
141 }
142 }
143
144 pub fn with_ner(mut self, ner: RuleBasedNER) -> Self {
146 self.ner = ner;
147 self
148 }
149
150 pub fn with_entity_linker(mut self, linker: EntityLinker) -> Self {
152 self.entity_linker = linker;
153 self
154 }
155
156 pub fn extract_advanced(&self, text: &str) -> Result<AdvancedExtractedInformation> {
158 let tokenizer = WordTokenizer::default();
159
160 let mut entities = self.ner.extract_entities(text)?;
162 let temporal_entities = self.temporal_extractor.extract(text)?;
163 entities.extend(temporal_entities);
164
165 for entity in &mut entities {
167 entity.confidence = self.confidence_scorer.score_entity(entity, text, 50);
168 }
169
170 let key_phrases = self.key_phrase_extractor.extract(text, &tokenizer)?;
171 let patterns = self.pattern_extractor.extract(text)?;
172 let relations = self.relation_extractor.extract_relations(text, &entities)?;
173
174 let linked_entities = self.entity_linker.link_entities(&mut entities)?;
176 let coreference_chains = self.coreference_resolver.resolve(text, &entities)?;
177
178 Ok(AdvancedExtractedInformation {
179 entities,
180 linked_entities,
181 key_phrases,
182 patterns,
183 relations,
184 coreference_chains,
185 })
186 }
187}