scirs2_text/information_extraction/
pipeline.rs

1//! Information extraction pipelines and result containers
2
3use super::confidence::ConfidenceScorer;
4use super::coreference::{CoreferenceChain, CoreferenceResolver};
5use super::entities::Entity;
6use super::extractors::{KeyPhraseExtractor, PatternExtractor, RuleBasedNER};
7use super::linking::{EntityLinker, LinkedEntity};
8use super::relations::{Relation, RelationExtractor};
9use super::temporal::TemporalExtractor;
10use crate::error::Result;
11use crate::tokenize::WordTokenizer;
12use std::collections::HashMap;
13
14/// Container for all extracted information
15#[derive(Debug)]
16pub struct ExtractedInformation {
17    /// All entities extracted from the text
18    pub entities: Vec<Entity>,
19    /// Key phrases with importance scores
20    pub key_phrases: Vec<(String, f64)>,
21    /// Patterns found in the text organized by pattern type
22    pub patterns: HashMap<String, Vec<String>>,
23    /// Relations found between entities
24    pub relations: Vec<Relation>,
25}
26
27/// Enhanced container for all extracted information
28#[derive(Debug)]
29pub struct AdvancedExtractedInformation {
30    /// All entities extracted from the text
31    pub entities: Vec<Entity>,
32    /// Entities linked to knowledge base
33    pub linked_entities: Vec<LinkedEntity>,
34    /// Key phrases with importance scores
35    pub key_phrases: Vec<(String, f64)>,
36    /// Patterns found in the text organized by pattern type
37    pub patterns: HashMap<String, Vec<String>>,
38    /// Relations found between entities
39    pub relations: Vec<Relation>,
40    /// Coreference chains
41    pub coreference_chains: Vec<CoreferenceChain>,
42}
43
44/// Comprehensive information extraction pipeline
45pub struct InformationExtractionPipeline {
46    ner: RuleBasedNER,
47    key_phrase_extractor: KeyPhraseExtractor,
48    pattern_extractor: PatternExtractor,
49    relation_extractor: RelationExtractor,
50}
51
52impl Default for InformationExtractionPipeline {
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl InformationExtractionPipeline {
59    /// Create a new extraction pipeline
60    pub fn new() -> Self {
61        Self {
62            ner: RuleBasedNER::new(),
63            key_phrase_extractor: KeyPhraseExtractor::new(),
64            pattern_extractor: PatternExtractor::new(),
65            relation_extractor: RelationExtractor::new(),
66        }
67    }
68
69    /// Set the NER component
70    pub fn with_ner(mut self, ner: RuleBasedNER) -> Self {
71        self.ner = ner;
72        self
73    }
74
75    /// Set the key phrase extractor
76    pub fn with_key_phrase_extractor(mut self, extractor: KeyPhraseExtractor) -> Self {
77        self.key_phrase_extractor = extractor;
78        self
79    }
80
81    /// Set the pattern extractor
82    pub fn with_pattern_extractor(mut self, extractor: PatternExtractor) -> Self {
83        self.pattern_extractor = extractor;
84        self
85    }
86
87    /// Set the relation extractor
88    pub fn with_relation_extractor(mut self, extractor: RelationExtractor) -> Self {
89        self.relation_extractor = extractor;
90        self
91    }
92
93    /// Extract all information from text
94    pub fn extract(&self, text: &str) -> Result<ExtractedInformation> {
95        let tokenizer = WordTokenizer::default();
96
97        let entities = self.ner.extract_entities(text)?;
98        let key_phrases = self.key_phrase_extractor.extract(text, &tokenizer)?;
99        let patterns = self.pattern_extractor.extract(text)?;
100        let relations = self.relation_extractor.extract_relations(text, &entities)?;
101
102        Ok(ExtractedInformation {
103            entities,
104            key_phrases,
105            patterns,
106            relations,
107        })
108    }
109}
110
111/// Enhanced information extraction pipeline with advanced features
112pub struct AdvancedExtractionPipeline {
113    ner: RuleBasedNER,
114    key_phrase_extractor: KeyPhraseExtractor,
115    pattern_extractor: PatternExtractor,
116    relation_extractor: RelationExtractor,
117    temporal_extractor: TemporalExtractor,
118    entity_linker: EntityLinker,
119    coreference_resolver: CoreferenceResolver,
120    confidence_scorer: ConfidenceScorer,
121}
122
123impl Default for AdvancedExtractionPipeline {
124    fn default() -> Self {
125        Self::new()
126    }
127}
128
129impl AdvancedExtractionPipeline {
130    /// Create new advanced extraction pipeline
131    pub fn new() -> Self {
132        Self {
133            ner: RuleBasedNER::new(),
134            key_phrase_extractor: KeyPhraseExtractor::new(),
135            pattern_extractor: PatternExtractor::new(),
136            relation_extractor: RelationExtractor::new(),
137            temporal_extractor: TemporalExtractor::new(),
138            entity_linker: EntityLinker::new(),
139            coreference_resolver: CoreferenceResolver::new(),
140            confidence_scorer: ConfidenceScorer::new(),
141        }
142    }
143
144    /// Configure components
145    pub fn with_ner(mut self, ner: RuleBasedNER) -> Self {
146        self.ner = ner;
147        self
148    }
149
150    /// Configure the entity linker component
151    pub fn with_entity_linker(mut self, linker: EntityLinker) -> Self {
152        self.entity_linker = linker;
153        self
154    }
155
156    /// Extract comprehensive information with advanced features
157    pub fn extract_advanced(&self, text: &str) -> Result<AdvancedExtractedInformation> {
158        let tokenizer = WordTokenizer::default();
159
160        // Basic extractions
161        let mut entities = self.ner.extract_entities(text)?;
162        let temporal_entities = self.temporal_extractor.extract(text)?;
163        entities.extend(temporal_entities);
164
165        // Enhance confidence scores
166        for entity in &mut entities {
167            entity.confidence = self.confidence_scorer.score_entity(entity, text, 50);
168        }
169
170        let key_phrases = self.key_phrase_extractor.extract(text, &tokenizer)?;
171        let patterns = self.pattern_extractor.extract(text)?;
172        let relations = self.relation_extractor.extract_relations(text, &entities)?;
173
174        // Advanced extractions
175        let linked_entities = self.entity_linker.link_entities(&mut entities)?;
176        let coreference_chains = self.coreference_resolver.resolve(text, &entities)?;
177
178        Ok(AdvancedExtractedInformation {
179            entities,
180            linked_entities,
181            key_phrases,
182            patterns,
183            relations,
184            coreference_chains,
185        })
186    }
187}