Skip to main content

anno/backends/
event_extractor.rs

//! Event extraction trait and implementations.
//!
//! Event extraction identifies event triggers (verbs/nouns that denote events)
//! and their arguments (participants, time, place) following the ACE event schema.
//!
//! # ACE Event Schema
//!
//! Events have:
//! - **Trigger**: The word/phrase that denotes the event (e.g., "invaded", "announced")
//! - **Type**: Event category (e.g., "conflict:attack", "movement:transport")
//! - **Arguments**: Roles filled by entities (e.g., Agent, Patient, Time, Place)
//!
//! # Usage
//!
//! ```rust
//! use anno::backends::event_extractor::{EventExtractor, RuleBasedEventExtractor};
//!
//! let extractor = RuleBasedEventExtractor::new();
//! let events = extractor
//!     .extract_events("Russia invaded Ukraine in 2022.", None)
//!     .unwrap();
//!
//! for event in &events {
//!     println!("Event: {} ({})", event.trigger, event.event_type);
//!     for (role, entity) in &event.arguments {
//!         println!("  {}: {}", role, entity.text);
//!     }
//! }
//! ```
//!
//! # Research Background
//!
//! Based on ACE 2005 event ontology:
//! - Life: be-born, marry, divorce, injure, die
//! - Movement: transport
//! - Transaction: transfer-ownership, transfer-money
//! - Business: start-org, merge-org, declare-bankruptcy, end-org
//! - Conflict: attack, demonstrate
//! - Contact: meet, phone-write
//! - Personnel: start-position, end-position, nominate, elect
//! - Justice: arrest-jail, release-parole, trial-hearing, charge-indict, etc.
42
43use crate::{Entity, Result};
44
45/// Event with trigger and arguments following ACE schema.
46#[derive(Debug, Clone)]
47pub struct Event {
48    /// The trigger word/phrase (e.g., "invaded", "announced")
49    pub trigger: String,
50    /// Start character offset of trigger
51    pub trigger_start: usize,
52    /// End character offset of trigger
53    pub trigger_end: usize,
54    /// Event type (e.g., "conflict:attack", "movement:transport")
55    pub event_type: String,
56    /// Event arguments: (role, entity)
57    /// Common roles: Agent, Patient, Time, Place, Instrument
58    pub arguments: Vec<(String, Entity)>,
59    /// Confidence score (0.0-1.0)
60    pub confidence: f64,
61}
62
63impl Event {
64    /// Create a new event.
65    #[must_use]
66    pub fn new(
67        trigger: impl Into<String>,
68        trigger_start: usize,
69        trigger_end: usize,
70        event_type: impl Into<String>,
71    ) -> Self {
72        Self {
73            trigger: trigger.into(),
74            trigger_start,
75            trigger_end,
76            event_type: event_type.into(),
77            arguments: Vec::new(),
78            confidence: 1.0,
79        }
80    }
81
82    /// Add an argument to this event.
83    #[must_use]
84    pub fn with_argument(mut self, role: impl Into<String>, entity: Entity) -> Self {
85        self.arguments.push((role.into(), entity));
86        self
87    }
88
89    /// Set confidence score.
90    #[must_use]
91    pub fn with_confidence(mut self, confidence: f64) -> Self {
92        self.confidence = confidence.clamp(0.0, 1.0);
93        self
94    }
95}
96
97/// Trait for event extraction backends.
98///
99/// Event extraction identifies event triggers and their arguments.
100/// This is distinct from NER (which extracts entities) and relation extraction
101/// (which links entities). Events are structured occurrences with participants.
102pub trait EventExtractor: Send + Sync {
103    /// Extract events from text.
104    ///
105    /// # Arguments
106    ///
107    /// * `text` - Input text to extract events from
108    /// * `language` - Optional language hint (e.g., "en", "es")
109    ///
110    /// # Returns
111    ///
112    /// Vector of events with triggers and arguments.
113    fn extract_events(&self, text: &str, language: Option<&str>) -> Result<Vec<Event>>;
114
115    /// Get the extractor name/identifier.
116    fn name(&self) -> &'static str;
117
118    /// Get a description of the extractor.
119    fn description(&self) -> &'static str {
120        "Event extractor"
121    }
122}
123
/// Rule-based event extractor using trigger patterns.
///
/// Uses simple pattern matching on event trigger words.
/// Fast but limited coverage compared to neural methods.
#[derive(Debug, Clone, PartialEq)]
pub struct RuleBasedEventExtractor {
    /// Minimum confidence an event must reach to be reported.
    /// Always a real number within `0.0..=1.0` (never NaN).
    threshold: f64,
}

impl RuleBasedEventExtractor {
    /// Threshold used by [`new`](Self::new), and substituted when a caller
    /// passes NaN to [`with_threshold`](Self::with_threshold).
    const DEFAULT_THRESHOLD: f64 = 0.5;

    /// Create a new rule-based event extractor with the default
    /// confidence threshold of `0.5`.
    #[must_use]
    pub fn new() -> Self {
        Self {
            threshold: Self::DEFAULT_THRESHOLD,
        }
    }

    /// Create an extractor with a custom confidence threshold.
    ///
    /// `threshold` is clamped into `0.0..=1.0`. A NaN threshold falls back to
    /// the default (`0.5`): `f64::clamp` would pass NaN through unchanged, and
    /// every `confidence >= NaN` comparison is false, which would silently
    /// suppress all events.
    #[must_use]
    pub fn with_threshold(threshold: f64) -> Self {
        let threshold = if threshold.is_nan() {
            Self::DEFAULT_THRESHOLD
        } else {
            threshold.clamp(0.0, 1.0)
        };
        Self { threshold }
    }
}

impl Default for RuleBasedEventExtractor {
    /// Same as [`RuleBasedEventExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}
154
155impl EventExtractor for RuleBasedEventExtractor {
156    fn extract_events(&self, text: &str, language: Option<&str>) -> Result<Vec<Event>> {
157        let mut events = Vec::new();
158
159        // Language-aware trigger patterns (ACE 2005 inspired)
160        // In production, this would use a more sophisticated lexicon
161        let lang_code = language.map(|l| l.split('-').next().unwrap_or(l).to_lowercase());
162
163        // Get language-specific patterns or fall back to English
164        let trigger_patterns: Vec<(&str, &str)> = match lang_code.as_deref() {
165            Some("es") => vec![
166                // Spanish conflict events
167                ("invadió", "conflict:attack"),
168                ("atacó", "conflict:attack"),
169                ("bombardeó", "conflict:attack"),
170                ("guerra", "conflict:attack"),
171                // Spanish movement
172                ("viajó", "movement:transport"),
173                ("movió", "movement:transport"),
174                ("desplegó", "movement:transport"),
175                // Spanish transaction
176                ("compró", "transaction:transfer-ownership"),
177                ("vendió", "transaction:transfer-ownership"),
178                ("adquirió", "transaction:transfer-ownership"),
179                // Spanish business
180                ("fundó", "business:start-org"),
181                ("inició", "business:start-org"),
182                ("fusionó", "business:merge-org"),
183                // Spanish communication
184                ("anunció", "communication:announce"),
185                ("declaró", "communication:announce"),
186                ("informó", "communication:announce"),
187                ("dijo", "communication:announce"),
188                // Spanish life
189                ("nació", "life:be-born"),
190                ("murió", "life:die"),
191                ("se casó", "life:marry"),
192                ("divorció", "life:divorce"),
193                // Spanish justice
194                ("arrestó", "justice:arrest-jail"),
195                ("acusó", "justice:charge-indict"),
196                ("condenó", "justice:convict"),
197                ("sentenció", "justice:sentence"),
198            ],
199            Some("fr") => vec![
200                // French conflict
201                ("envahi", "conflict:attack"),
202                ("attaqué", "conflict:attack"),
203                ("bombardé", "conflict:attack"),
204                ("guerre", "conflict:attack"),
205                // French movement
206                ("voyagé", "movement:transport"),
207                ("déplacé", "movement:transport"),
208                ("déployé", "movement:transport"),
209                // French transaction
210                ("acheté", "transaction:transfer-ownership"),
211                ("vendu", "transaction:transfer-ownership"),
212                ("acquis", "transaction:transfer-ownership"),
213                // French business
214                ("fondé", "business:start-org"),
215                ("créé", "business:start-org"),
216                ("fusionné", "business:merge-org"),
217                // French communication
218                ("annoncé", "communication:announce"),
219                ("déclaré", "communication:announce"),
220                ("rapporté", "communication:announce"),
221                ("dit", "communication:announce"),
222                // French life
223                ("né", "life:be-born"),
224                ("mort", "life:die"),
225                ("marié", "life:marry"),
226                ("divorcé", "life:divorce"),
227                // French justice
228                ("arrêté", "justice:arrest-jail"),
229                ("accusé", "justice:charge-indict"),
230                ("condamné", "justice:convict"),
231                ("condamné", "justice:sentence"),
232            ],
233            Some("de") => vec![
234                // German conflict
235                ("invadiert", "conflict:attack"),
236                ("angegriffen", "conflict:attack"),
237                ("bombardiert", "conflict:attack"),
238                ("krieg", "conflict:attack"),
239                // German movement
240                ("gereist", "movement:transport"),
241                ("bewegt", "movement:transport"),
242                ("verlegt", "movement:transport"),
243                // German transaction
244                ("gekauft", "transaction:transfer-ownership"),
245                ("verkauft", "transaction:transfer-ownership"),
246                ("erworben", "transaction:transfer-ownership"),
247                // German business
248                ("gegründet", "business:start-org"),
249                ("gestartet", "business:start-org"),
250                ("fusioniert", "business:merge-org"),
251                // German communication
252                ("angekündigt", "communication:announce"),
253                ("erklärt", "communication:announce"),
254                ("berichtet", "communication:announce"),
255                ("sagte", "communication:announce"),
256                // German life
257                ("geboren", "life:be-born"),
258                ("gestorben", "life:die"),
259                ("geheiratet", "life:marry"),
260                ("geschieden", "life:divorce"),
261                // German justice
262                ("verhaftet", "justice:arrest-jail"),
263                ("angeklagt", "justice:charge-indict"),
264                ("verurteilt", "justice:convict"),
265                ("verurteilt", "justice:sentence"),
266            ],
267            Some("zh") | Some("ja") | Some("ko") | Some("ar") | Some("ru") => {
268                // For CJK, Arabic, Russian: use English patterns as fallback
269                // In production, add language-specific patterns
270                vec![]
271            }
272            _ => vec![
273                // English (default)
274                ("invaded", "conflict:attack"),
275                ("attacked", "conflict:attack"),
276                ("bombed", "conflict:attack"),
277                ("fired", "conflict:attack"),
278                ("war", "conflict:attack"),
279                ("traveled", "movement:transport"),
280                ("moved", "movement:transport"),
281                ("transported", "movement:transport"),
282                ("deployed", "movement:transport"),
283                ("bought", "transaction:transfer-ownership"),
284                ("sold", "transaction:transfer-ownership"),
285                ("purchased", "transaction:transfer-ownership"),
286                ("acquired", "transaction:transfer-ownership"),
287                ("founded", "business:start-org"),
288                ("started", "business:start-org"),
289                ("merged", "business:merge-org"),
290                ("bankruptcy", "business:declare-bankruptcy"),
291                ("announced", "communication:announce"),
292                ("stated", "communication:announce"),
293                ("reported", "communication:announce"),
294                ("said", "communication:announce"),
295                ("born", "life:be-born"),
296                ("died", "life:die"),
297                ("married", "life:marry"),
298                ("divorced", "life:divorce"),
299                ("arrested", "justice:arrest-jail"),
300                ("charged", "justice:charge-indict"),
301                ("convicted", "justice:convict"),
302                ("sentenced", "justice:sentence"),
303            ],
304        };
305
306        // Use Unicode-aware lowercasing for multilingual support
307        // For CJK languages, case doesn't apply, but this is safe
308        let text_lower = text.to_lowercase();
309        for (trigger_word, event_type) in trigger_patterns {
310            // For CJK and other languages without case, search in original text too
311            let search_text = if lang_code
312                .as_deref()
313                .is_some_and(|l| matches!(l, "zh" | "ja" | "ko" | "ar" | "ru"))
314            {
315                text // No lowercasing for languages where case doesn't apply
316            } else {
317                &text_lower
318            };
319
320            if let Some(pos) = search_text.find(trigger_word) {
321                // Find character offset (not byte offset)
322                let char_start = text
323                    .char_indices()
324                    .nth(pos)
325                    .map(|(i, _)| i)
326                    .unwrap_or(text.len());
327                let char_end = text
328                    .char_indices()
329                    .nth(pos + trigger_word.chars().count())
330                    .map(|(i, _)| i)
331                    .unwrap_or(text.len());
332
333                // Extract actual trigger text (preserving original case)
334                let trigger_text: String = text
335                    .chars()
336                    .skip(pos)
337                    .take(trigger_word.chars().count())
338                    .collect();
339
340                let event = Event::new(trigger_text, char_start, char_end, event_type.to_string())
341                    .with_confidence(0.7); // Rule-based confidence
342
343                if event.confidence >= self.threshold {
344                    events.push(event);
345                }
346            }
347        }
348
349        Ok(events)
350    }
351
352    fn name(&self) -> &'static str {
353        "rule-based-event"
354    }
355
356    fn description(&self) -> &'static str {
357        "Rule-based event extraction using trigger word patterns"
358    }
359}
360
#[cfg(test)]
mod tests {
    use super::*;

    /// Run a default-configured extractor over `text`, panicking on error.
    fn extract(text: &str, language: Option<&str>) -> Vec<Event> {
        RuleBasedEventExtractor::new()
            .extract_events(text, language)
            .unwrap()
    }

    /// English text with a known trigger yields the expected event type.
    #[test]
    fn test_rule_based_event_extraction() {
        let events = extract("Russia invaded Ukraine in 2022.", None);

        assert!(!events.is_empty());
        let has_invasion = events
            .iter()
            .any(|e| e.trigger.to_lowercase() == "invaded" && e.event_type == "conflict:attack");
        assert!(has_invasion);
    }

    /// A freshly constructed event carries no arguments.
    #[test]
    fn test_event_with_arguments() {
        // In a full implementation, we'd extract entities and link them as arguments
        let event = Event::new("invaded", 7, 14, "conflict:attack");
        assert_eq!(event.arguments.len(), 0);
    }

    /// Any event reported for multi-byte text has a well-formed span that
    /// stays within the text's character count.
    #[test]
    fn test_event_unicode_offsets() {
        let text = "ロシアがウクライナを侵攻した。"; // "Russia invaded Ukraine" in Japanese
        let char_len = text.chars().count();
        let events = extract(text, Some("ja"));

        // Verify offsets are character-based
        assert!(events
            .iter()
            .all(|e| e.trigger_start <= e.trigger_end && e.trigger_end <= char_len));
    }

    /// Spanish, French and German inputs each produce at least one event.
    #[test]
    fn test_multilingual_event_extraction() {
        let cases = [
            // Spanish - "invadió" should match
            (
                "Rusia invadió Ucrania en 2022.",
                "es",
                "Should extract Spanish events",
            ),
            // French - "envahi" should match
            (
                "La Russie a envahi l'Ukraine en 2022.",
                "fr",
                "Should extract French events",
            ),
            // German - "angegriffen" should match (past participle form)
            (
                "Russland hat die Ukraine 2022 angegriffen.",
                "de",
                "Should extract German events",
            ),
        ];

        for (text, language, message) in cases {
            let events = extract(text, Some(language));
            assert!(!events.is_empty(), "{}", message);
        }
    }
}
420}