use super::{DiscourseReferent, EventMention, EventPolarity, EventTense, ReferentType};
use crate::Entity;
use std::collections::HashMap;
#[cfg(feature = "candle")]
use crate::backends::gliner_candle::GLiNERCandle;
/// Rule-based lexicon of event trigger words plus modality and negation cues.
#[derive(Debug, Clone)]
pub struct EventTriggerLexicon {
    /// Lowercased trigger word -> (event type label, optional polarity hint).
    triggers: HashMap<String, (String, Option<EventPolarity>)>,
    /// Modal verbs that mark an event as uncertain ("might", "could", ...).
    modal_verbs: Vec<String>,
    /// Negation cues that mark an event as negated ("not", "never", ...).
    negation_words: Vec<String>,
}
impl Default for EventTriggerLexicon {
fn default() -> Self {
Self::new()
}
}
impl EventTriggerLexicon {
    /// Inserts every word in `words` mapped to `event_type` with the given
    /// optional polarity hint.
    ///
    /// A word already present from an earlier category is overwritten, so the
    /// call order inside [`EventTriggerLexicon::new`] decides which category
    /// wins for ambiguous words (e.g. "fired": attack vs. end-position).
    fn insert_triggers(
        triggers: &mut HashMap<String, (String, Option<EventPolarity>)>,
        words: &[&str],
        event_type: &str,
        polarity: Option<EventPolarity>,
    ) {
        for word in words {
            triggers.insert((*word).to_string(), (event_type.to_string(), polarity));
        }
    }

    /// Builds the built-in English trigger lexicon plus the modal-verb and
    /// negation cue lists.
    #[must_use]
    pub fn new() -> Self {
        let mut triggers = HashMap::new();
        // NOTE: category order matters — later categories overwrite earlier
        // ones for words appearing in more than one list. Do not reorder.
        Self::insert_triggers(
            &mut triggers,
            &[
                "attack", "attacked", "attacking", "attacks", "invade", "invaded", "invading",
                "invades", "invasion", "bomb", "bombed", "bombing", "bombs", "bombardment",
                "strike", "struck", "striking", "strikes", "assault", "assaulted", "assaulting",
                "assaults", "fight", "fought", "fighting", "fights", "battle", "battled",
                "battling", "battles", "war", "warfare", "kill", "killed", "killing", "kills",
                "murder", "murdered", "murdering", "murders", "shoot", "shot", "shooting",
                "shoots", "fire", "fired", "firing", "fires", "clash", "clashed", "clashing",
                "clashes", "destroy", "destroyed", "destroying", "destroys", "destruction",
            ],
            "conflict:attack",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "protest", "protested", "protesting", "protests", "demonstrate", "demonstrated",
                "demonstrating", "demonstrates", "demonstration", "rally", "rallied", "rallying",
                "rallies", "march", "marched", "marching", "marches", "riot", "rioted", "rioting",
                "riots",
            ],
            "conflict:demonstrate",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "go", "went", "going", "goes", "gone", "move", "moved", "moving", "moves",
                "movement", "travel", "traveled", "travelling", "travels", "arrive", "arrived",
                "arriving", "arrives", "arrival", "depart", "departed", "departing", "departs",
                "departure", "leave", "left", "leaving", "leaves", "return", "returned",
                "returning", "returns", "flee", "fled", "fleeing", "flees", "escape", "escaped",
                "escaping", "escapes", "transport", "transported", "transporting", "transports",
                "ship", "shipped", "shipping", "ships", "shipment", "send", "sent", "sending",
                "sends", "deploy", "deployed", "deploying", "deploys", "deployment",
            ],
            "movement:transport",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "buy", "bought", "buying", "buys", "sell", "sold", "selling", "sells", "sale",
                "acquire", "acquired", "acquiring", "acquires", "acquisition", "purchase",
                "purchased", "purchasing", "purchases", "pay", "paid", "paying", "pays",
                "payment", "donate", "donated", "donating", "donates", "donation", "transfer",
                "transferred", "transferring", "transfers", "invest", "invested", "investing",
                "invests", "investment", "fund", "funded", "funding", "funds",
            ],
            "transaction:transfer",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "found", "founded", "founding", "founds", "foundation", "start", "started",
                "starting", "starts", "establish", "established", "establishing", "establishes",
                "create", "created", "creating", "creates", "creation", "launch", "launched",
                "launching", "launches", "open", "opened", "opening", "opens",
            ],
            "business:start-org",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            // "take over"/"took over" contain spaces, so the current
            // single-token tokenizer can never match them; kept for parity.
            &[
                "merge", "merged", "merging", "merges", "merger", "acquire", "acquired",
                "acquiring", "acquires", "acquisition", "takeover", "take over", "took over",
            ],
            "business:merge-org",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "bankrupt", "bankruptcy", "collapse", "collapsed", "collapsing", "collapses",
                "fail", "failed", "failing", "fails", "failure", "close", "closed", "closing",
                "closes", "closure", "shutdown", "shut down",
            ],
            "business:end-org",
            // End-of-organization triggers carry an inherently negative hint.
            Some(EventPolarity::Negative),
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "meet", "met", "meeting", "meets", "visit", "visited", "visiting", "visits",
                "talk", "talked", "talking", "talks", "discuss", "discussed", "discussing",
                "discusses", "discussion", "negotiate", "negotiated", "negotiating",
                "negotiates", "negotiation", "summit",
            ],
            "contact:meet",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "call", "called", "calling", "calls", "phone", "phoned", "phoning", "phones",
                "email", "emailed", "emailing", "emails", "write", "wrote", "writing", "writes",
                "announce", "announced", "announcing", "announces", "announcement", "say",
                "said", "saying", "says", "state", "stated", "stating", "states", "statement",
                "declare", "declared", "declaring", "declares", "declaration", "report",
                "reported", "reporting", "reports", "claim", "claimed", "claiming", "claims",
            ],
            "contact:communicate",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "hire", "hired", "hiring", "hires", "appoint", "appointed", "appointing",
                "appoints", "appointment", "promote", "promoted", "promoting", "promotes",
                "promotion", "elect", "elected", "electing", "elects", "election", "nominate",
                "nominated", "nominating", "nominates", "nomination",
            ],
            "personnel:start-position",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "resign", "resigned", "resigning", "resigns", "resignation", "retire",
                "retired", "retiring", "retires", "retirement", "fire", "fired", "firing",
                "fires", "dismiss", "dismissed", "dismissing", "dismisses", "dismissal", "lay",
                "laid", "layoff", "layoffs", "quit", "quitting", "quits", "leave", "left",
                "leaving",
            ],
            "personnel:end-position",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "arrest", "arrested", "arresting", "arrests", "detain", "detained",
                "detaining", "detains", "detention", "jail", "jailed", "jailing", "jails",
                "imprison", "imprisoned", "imprisoning", "imprisons", "imprisonment",
                "capture", "captured", "capturing", "captures",
            ],
            "justice:arrest",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "charge", "charged", "charging", "charges", "indict", "indicted", "indicting",
                "indicts", "indictment", "accuse", "accused", "accusing", "accuses",
                "accusation", "prosecute", "prosecuted", "prosecuting", "prosecutes",
                "prosecution",
            ],
            "justice:charge",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "convict", "convicted", "convicting", "convicts", "conviction", "sentence",
                "sentenced", "sentencing", "sentences", "fine", "fined", "fining", "fines",
                "acquit", "acquitted", "acquitting", "acquits", "acquittal", "pardon",
                "pardoned", "pardoning", "pardons",
            ],
            "justice:convict",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "sue", "sued", "suing", "sues", "lawsuit", "litigate", "litigated",
                "litigating", "litigates", "litigation",
            ],
            "justice:sue",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "release", "released", "releasing", "releases", "free", "freed", "freeing",
                "frees", "parole", "paroled", "paroling", "paroles",
            ],
            "justice:release",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "born", "birth", "die", "died", "dying", "dies", "death", "kill", "killed",
                "killing", "kills", "injure", "injured", "injuring", "injures", "injury",
                "wound", "wounded", "wounding", "wounds", "marry", "married", "marrying",
                "marries", "marriage", "wedding", "divorce", "divorced", "divorcing",
                "divorces",
            ],
            "life:event",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "earthquake", "quake", "flood", "flooded", "flooding", "floods", "hurricane",
                "typhoon", "cyclone", "storm", "tornado", "tornadoes", "wildfire", "fire",
                "fires", "tsunami", "drought", "eruption", "erupted", "erupting", "erupts",
                "landslide", "avalanche",
            ],
            "disaster:natural",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "crash", "crashed", "crashing", "crashes", "fail", "failed", "failing",
                "fails", "failure", "break", "broke", "breaking", "breaks", "breakdown",
                "malfunction", "malfunctioned", "malfunctioning", "outage", "hack", "hacked",
                "hacking", "hacks", "breach", "breached", "breaching", "breaches",
            ],
            "technical:failure",
            None,
        );
        Self::insert_triggers(
            &mut triggers,
            &[
                "rise", "rose", "rising", "rises", "fall", "fell", "falling", "falls",
                "increase", "increased", "increasing", "increases", "decrease", "decreased",
                "decreasing", "decreases", "grow", "grew", "growing", "grows", "growth",
                "shrink", "shrank", "shrinking", "shrinks", "spike", "spiked", "spiking",
                "spikes", "plunge", "plunged", "plunging", "plunges", "surge", "surged",
                "surging", "surges", "drop", "dropped", "dropping", "drops",
            ],
            "economic:change",
            None,
        );
        let modal_verbs = ["might", "may", "could", "would", "should", "can", "will"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();
        let negation_words = [
            "not", "never", "no", "none", "neither", "nobody", "nothing", "nowhere", "hardly",
            "scarcely", "barely", "don't", "doesn't", "didn't", "won't", "wouldn't", "couldn't",
            "shouldn't", "can't", "cannot", "hasn't", "haven't", "hadn't", "isn't", "aren't",
            "wasn't", "weren't",
        ]
        .iter()
        .map(|s| (*s).to_string())
        .collect();
        Self {
            triggers,
            modal_verbs,
            negation_words,
        }
    }

    /// Case-insensitive lookup: returns the event type label and an optional
    /// polarity hint when `word` is a known trigger.
    #[must_use]
    pub fn lookup(&self, word: &str) -> Option<(&str, Option<EventPolarity>)> {
        let lower = word.to_lowercase();
        self.triggers.get(&lower).map(|(t, p)| (t.as_str(), *p))
    }

    /// True when `word` (case-insensitively) is a modal verb such as "might".
    #[must_use]
    pub fn is_modal(&self, word: &str) -> bool {
        let lower = word.to_lowercase();
        self.modal_verbs.iter().any(|m| m == &lower)
    }

    /// True when `word` (case-insensitively) is a negation cue such as "not".
    #[must_use]
    pub fn is_negation(&self, word: &str) -> bool {
        let lower = word.to_lowercase();
        self.negation_words.iter().any(|n| n == &lower)
    }
}
/// Tuning knobs for [`EventExtractor`].
#[derive(Debug, Clone)]
pub struct EventExtractorConfig {
    /// Confidence threshold handed to the GLiNER backend; rule-based matches
    /// use a fixed 0.8 confidence instead.
    pub min_confidence: f64,
    /// Maximum trigger-to-argument distance, in tokens.
    pub max_arg_distance: usize,
    /// Whether nested events should be extracted.
    pub extract_nested: bool,
    /// Optional GLiNER model id; when set, neural extraction is attempted
    /// before the rule-based fallback.
    #[cfg(feature = "candle")]
    pub gliner_model: Option<String>,
    /// Labels handed to GLiNER when classifying candidate event spans.
    pub event_labels: Vec<String>,
}
impl Default for EventExtractorConfig {
    /// Conservative defaults: 0.5 threshold, 10-token argument window, no
    /// nested events, no neural model, and the generic event label set.
    fn default() -> Self {
        let event_labels = Self::default_event_labels();
        Self {
            event_labels,
            min_confidence: 0.5,
            max_arg_distance: 10,
            extract_nested: false,
            #[cfg(feature = "candle")]
            gliner_model: None,
        }
    }
}
impl EventExtractorConfig {
    /// The generic event labels used for GLiNER prompting by default.
    fn default_event_labels() -> Vec<String> {
        [
            "conflict event",
            "movement event",
            "transaction event",
            "business event",
            "communication event",
            "personnel event",
            "justice event",
            "life event",
            "disaster event",
            "economic change",
        ]
        .iter()
        .map(|label| (*label).to_string())
        .collect()
    }

    /// Enables the neural path with the given GLiNER model id.
    #[cfg(feature = "candle")]
    #[must_use]
    pub fn with_gliner(mut self, model_id: impl Into<String>) -> Self {
        self.gliner_model = Some(model_id.into());
        self
    }

    /// Replaces the set of event labels sent to GLiNER.
    #[must_use]
    pub fn with_event_labels(mut self, labels: Vec<String>) -> Self {
        self.event_labels = labels;
        self
    }

    /// Sets the minimum confidence threshold.
    #[must_use]
    pub fn with_threshold(mut self, threshold: f64) -> Self {
        self.min_confidence = threshold;
        self
    }
}
/// Extracts event mentions from text, preferring a neural GLiNER backend
/// (when the `candle` feature is enabled and a model is loaded) and falling
/// back to the rule-based trigger lexicon otherwise.
#[derive(Debug)]
pub struct EventExtractor {
    lexicon: EventTriggerLexicon,
    config: EventExtractorConfig,
    /// Optional neural backend; `None` means purely rule-based extraction.
    #[cfg(feature = "candle")]
    gliner: Option<GLiNERCandle>,
}
impl Clone for EventExtractor {
    /// Clones the lexicon and config only. The GLiNER backend is deliberately
    /// dropped (presumably the model handle is not `Clone` — confirm), so a
    /// cloned extractor silently falls back to rule-based extraction.
    fn clone(&self) -> Self {
        Self {
            lexicon: self.lexicon.clone(),
            config: self.config.clone(),
            // A clone never carries the neural backend.
            #[cfg(feature = "candle")]
            gliner: None, }
    }
}
impl Default for EventExtractor {
fn default() -> Self {
Self::new()
}
}
impl EventExtractor {
#[must_use]
pub fn new() -> Self {
Self {
lexicon: EventTriggerLexicon::new(),
config: EventExtractorConfig::default(),
#[cfg(feature = "candle")]
gliner: None,
}
}
    /// Builds an extractor from `config`, eagerly loading the GLiNER model
    /// when one is configured (only with the `candle` feature).
    ///
    /// # Errors
    /// Returns an error when the configured GLiNER model fails to load.
    pub fn with_config(config: EventExtractorConfig) -> crate::Result<Self> {
        #[cfg(feature = "candle")]
        let gliner = if let Some(ref model_id) = config.gliner_model {
            log::info!("[EventExtractor] Loading GLiNER model: {}", model_id);
            Some(GLiNERCandle::from_pretrained(model_id)?)
        } else {
            None
        };
        Ok(Self {
            lexicon: EventTriggerLexicon::new(),
            config,
            #[cfg(feature = "candle")]
            gliner,
        })
    }
#[cfg(feature = "candle")]
pub fn with_gliner(model_id: &str) -> crate::Result<Self> {
let config = EventExtractorConfig::default().with_gliner(model_id);
Self::with_config(config)
}
    /// Reports whether a GLiNER neural backend is currently loaded.
    /// Always `false` when the `candle` feature is disabled.
    #[must_use]
    pub fn has_neural_backend(&self) -> bool {
        #[cfg(feature = "candle")]
        {
            self.gliner.is_some()
        }
        #[cfg(not(feature = "candle"))]
        {
            false
        }
    }
    /// Extracts event mentions from `text`.
    ///
    /// When a GLiNER backend is loaded, the neural path is tried first; any
    /// neural failure logs a warning and falls back to the rule-based
    /// lexicon path.
    #[must_use]
    pub fn extract(&self, text: &str) -> Vec<EventMention> {
        #[cfg(feature = "candle")]
        if let Some(ref gliner) = self.gliner {
            if let Ok(events) = self.extract_with_gliner(gliner, text) {
                return events;
            }
            log::warn!("[EventExtractor] GLiNER extraction failed, falling back to rules");
        }
        self.extract_rule_based(text)
    }
#[cfg(feature = "candle")]
fn extract_with_gliner(
&self,
gliner: &GLiNERCandle,
text: &str,
) -> crate::Result<Vec<EventMention>> {
let entities = gliner.extract(
text,
&self
.config
.event_labels
.iter()
.map(|s| s.as_str())
.collect::<Vec<_>>(),
self.config.min_confidence as f32,
)?;
let words = self.tokenize(text);
let mut events = Vec::new();
for entity in entities {
let event_type = self.map_gliner_label_to_event_type(&entity.entity_type.to_string());
let word_idx = words
.iter()
.position(|(_, start, end)| *start <= entity.start() && *end >= entity.end())
.unwrap_or(0);
let polarity = self.detect_polarity(&words, word_idx, None);
let tense = self.detect_tense(&words, word_idx, &entity.text);
let arguments = self.extract_arguments(text, &words, word_idx);
let mut event = EventMention::new(&entity.text, entity.start(), entity.end())
.with_trigger_type(event_type)
.with_polarity(polarity)
.with_confidence(entity.confidence.into());
if let Some(t) = tense {
event = event.with_tense(t);
}
if !arguments.is_empty() {
event.arguments = arguments;
}
events.push(event);
}
Ok(events)
}
#[cfg(feature = "candle")]
fn map_gliner_label_to_event_type(&self, label: &str) -> &'static str {
let label_lower = label.to_lowercase();
if label_lower.contains("conflict") || label_lower.contains("attack") {
"conflict:attack"
} else if label_lower.contains("movement") || label_lower.contains("travel") {
"movement:transport"
} else if label_lower.contains("transaction") || label_lower.contains("transfer") {
"transaction:transfer"
} else if label_lower.contains("business") || label_lower.contains("company") {
"business:event"
} else if label_lower.contains("communication") || label_lower.contains("announce") {
"contact:communicate"
} else if label_lower.contains("personnel") || label_lower.contains("hire") {
"personnel:event"
} else if label_lower.contains("justice") || label_lower.contains("legal") {
"justice:event"
} else if label_lower.contains("life")
|| label_lower.contains("birth")
|| label_lower.contains("death")
{
"life:event"
} else if label_lower.contains("disaster") || label_lower.contains("earthquake") {
"disaster:natural"
} else if label_lower.contains("economic") || label_lower.contains("change") {
"economic:change"
} else {
"event:generic"
}
}
fn extract_rule_based(&self, text: &str) -> Vec<EventMention> {
let mut events = Vec::new();
let words = self.tokenize(text);
for (word_idx, (word, start, end)) in words.iter().enumerate() {
if let Some((event_type, polarity_hint)) = self.lexicon.lookup(word) {
let polarity = self.detect_polarity(&words, word_idx, polarity_hint);
let tense = self.detect_tense(&words, word_idx, word);
let arguments = self.extract_arguments(text, &words, word_idx);
let mut event = EventMention::new(word.clone(), *start, *end)
.with_trigger_type(event_type)
.with_polarity(polarity)
.with_confidence(0.8);
if let Some(t) = tense {
event = event.with_tense(t);
}
if !arguments.is_empty() {
event.arguments = arguments;
}
events.push(event);
}
}
events
}
    /// Wraps each extracted event in a [`DiscourseReferent`] spanning the
    /// clause that contains its trigger.
    ///
    /// `find_clause_span` returns BYTE offsets while `DiscourseReferent`
    /// positions here are CHARACTER offsets, so the span is converted before
    /// constructing the referent.
    #[must_use]
    pub fn extract_referents(&self, text: &str) -> Vec<DiscourseReferent> {
        let events = self.extract(text);
        events
            .into_iter()
            .map(|event| {
                let (clause_byte_start, clause_byte_end) =
                    self.find_clause_span(text, event.trigger_start, event.trigger_end);
                // Convert the byte span into character positions.
                let clause_char_start = text[..clause_byte_start].chars().count();
                let clause_text = &text[clause_byte_start..clause_byte_end];
                let clause_char_end = clause_char_start + clause_text.chars().count();
                DiscourseReferent::new(ReferentType::Event, clause_char_start, clause_char_end)
                    .with_event(event.clone())
                    .with_text(clause_text)
                    .with_confidence(event.confidence.value())
            })
            .collect()
    }
    /// Runs extraction and then replaces the heuristic Agent/Patient
    /// arguments with real NER entity mentions when suitable ones exist.
    ///
    /// An entity ending within 50 positions before the trigger can become the
    /// Agent (the last such entity is taken, i.e. the nearest one assuming
    /// `entities` is sorted by position — TODO confirm at call sites); the
    /// first entity starting within 50 positions after the trigger becomes
    /// the Patient. NOTE(review): assumes entity offsets use the same offset
    /// scheme as the trigger offsets — confirm.
    #[must_use]
    pub fn extract_with_entities(&self, text: &str, entities: &[Entity]) -> Vec<EventMention> {
        let mut events = self.extract(text);
        for event in &mut events {
            let event_span = (event.trigger_start, event.trigger_end);
            let agents: Vec<_> = entities
                .iter()
                .filter(|e| e.end() <= event_span.0 && event_span.0 - e.end() < 50)
                .collect();
            let patients: Vec<_> = entities
                .iter()
                .filter(|e| e.start() >= event_span.1 && e.start() - event_span.1 < 50)
                .collect();
            if let Some(agent) = agents.last() {
                // Entity-derived arguments replace the heuristic ones.
                event.arguments.retain(|(r, _)| r != "Agent");
                event
                    .arguments
                    .push(("Agent".to_string(), agent.text.clone()));
            }
            if let Some(patient) = patients.first() {
                event.arguments.retain(|(r, _)| r != "Patient");
                event
                    .arguments
                    .push(("Patient".to_string(), patient.text.clone()));
            }
        }
        events
    }
fn tokenize(&self, text: &str) -> Vec<(String, usize, usize)> {
let mut tokens = Vec::new();
let mut word_start: Option<(usize, usize)> = None;
let mut char_pos = 0usize;
for (i, c) in text.char_indices() {
if c.is_alphanumeric() || c == '\'' || c == '-' {
if word_start.is_none() {
word_start = Some((i, char_pos));
}
} else if let Some((start_byte, start_char)) = word_start {
let word = &text[start_byte..i];
tokens.push((word.to_string(), start_char, char_pos));
word_start = None;
}
char_pos += 1;
}
if let Some((start_byte, start_char)) = word_start {
let word = &text[start_byte..];
tokens.push((word.to_string(), start_char, char_pos));
}
tokens
}
fn detect_polarity(
&self,
words: &[(String, usize, usize)],
trigger_idx: usize,
hint: Option<EventPolarity>,
) -> EventPolarity {
let start = trigger_idx.saturating_sub(3);
for (word, _, _) in &words[start..trigger_idx] {
if self.lexicon.is_negation(word) {
return EventPolarity::Negative;
}
}
for (word, _, _) in &words[start..trigger_idx] {
if self.lexicon.is_modal(word) {
return EventPolarity::Uncertain;
}
}
hint.unwrap_or(EventPolarity::Positive)
}
    /// Crude tense heuristic for a trigger word.
    ///
    /// First checks up to three tokens back for auxiliaries: "will"/"going"/
    /// "shall" mean future, "would"/"could"/"might"/"may" mean hypothetical.
    /// Otherwise falls back to morphology: an "-ed" suffix or a known
    /// irregular past form means past, an "-ing" suffix means present.
    /// Returns `None` when nothing matches.
    fn detect_tense(
        &self,
        words: &[(String, usize, usize)],
        trigger_idx: usize,
        trigger: &str,
    ) -> Option<EventTense> {
        let trigger_lower = trigger.to_lowercase();
        let start = trigger_idx.saturating_sub(3);
        for (word, _, _) in &words[start..trigger_idx] {
            let w = word.to_lowercase();
            if w == "will" || w == "going" || w == "shall" {
                return Some(EventTense::Future);
            }
            if w == "would" || w == "could" || w == "might" || w == "may" {
                return Some(EventTense::Hypothetical);
            }
        }
        // Regular "-ed" past tense, or one of the common irregular pasts.
        if trigger_lower.ends_with("ed")
            || matches!(
                trigger_lower.as_str(),
                "went"
                    | "came"
                    | "said"
                    | "took"
                    | "gave"
                    | "made"
                    | "got"
                    | "found"
                    | "knew"
                    | "thought"
                    | "felt"
                    | "became"
                    | "left"
                    | "held"
                    | "brought"
                    | "began"
                    | "kept"
                    | "put"
                    | "set"
                    | "saw"
                    | "heard"
                    | "told"
                    | "stood"
                    | "lost"
                    | "paid"
                    | "met"
                    | "ran"
                    | "sent"
                    | "built"
                    | "fell"
                    | "caught"
                    | "wrote"
                    | "sat"
                    | "led"
                    | "rose"
                    | "spoke"
                    | "won"
                    | "broke"
                    | "spent"
                    | "hit"
                    | "cut"
                    | "sold"
                    | "bought"
                    | "shot"
                    | "struck"
                    | "shut"
                    | "threw"
                    | "drove"
                    | "flew"
                    | "drew"
                    | "grew"
                    | "sang"
                    | "swam"
                    | "rang"
                    | "wore"
                    | "chose"
                    | "woke"
                    | "froze"
                    | "stole"
                    | "blew"
                    | "ate"
                    | "drank"
                    | "rode"
                    | "shook"
                    | "bit"
                    | "hid"
                    | "tore"
                    | "beat"
                    | "laid"
                    | "spread"
                    | "hurt"
                    | "fought"
                    | "hung"
                    | "slept"
                    | "swept"
                    | "bent"
                    | "dealt"
                    | "fed"
                    | "fled"
                    | "dug"
                    | "spun"
                    | "wove"
                    | "sank"
                    | "shone"
                    | "swung"
                    | "clung"
                    | "crept"
                    | "burnt"
                    | "leapt"
                    | "meant"
                    | "lent"
                    | "dwelt"
                    | "dreamt"
                    | "knelt"
                    | "split"
                    | "spit"
                    | "bid"
                    | "forbid"
                    | "shed"
                    | "rid"
                    | "burst"
                    | "stuck"
                    | "slid"
            )
        {
            return Some(EventTense::Past);
        }
        if trigger_lower.ends_with("ing") {
            return Some(EventTense::Present);
        }
        None
    }
fn extract_arguments(
&self,
_text: &str,
words: &[(String, usize, usize)],
trigger_idx: usize,
) -> Vec<(String, String)> {
let mut arguments = Vec::new();
if trigger_idx > 0 {
for (word, _, _) in words[..trigger_idx].iter().rev().take(5) {
if word
.chars()
.next()
.map(|c| c.is_uppercase())
.unwrap_or(false)
&& word.len() > 1
&& !Self::is_sentence_start_word(word)
{
arguments.push(("Agent".to_string(), word.clone()));
break;
}
}
}
if trigger_idx + 1 < words.len() {
for (word, _, _) in words[trigger_idx + 1..].iter().take(5) {
if word
.chars()
.next()
.map(|c| c.is_uppercase())
.unwrap_or(false)
&& word.len() > 1
{
arguments.push(("Patient".to_string(), word.clone()));
break;
}
}
}
arguments
}
fn is_sentence_start_word(word: &str) -> bool {
matches!(
word.to_lowercase().as_str(),
"the"
| "a"
| "an"
| "this"
| "that"
| "these"
| "those"
| "it"
| "he"
| "she"
| "they"
| "we"
| "i"
)
}
    /// Returns the BYTE span of the clause containing the trigger, where the
    /// trigger itself is given in CHARACTER offsets.
    ///
    /// The clause starts just after the previous delimiter (`.`, `!`, `?`,
    /// `;`, `:`), with following whitespace skipped, and ends at (and
    /// includes) the next delimiter — a comma also terminates it on the
    /// right. Without delimiters the span covers the whole text.
    fn find_clause_span(
        &self,
        text: &str,
        trigger_start: usize,
        trigger_end: usize,
    ) -> (usize, usize) {
        // Translate character offsets into byte offsets (O(n) scans); an
        // out-of-range offset clamps to the end of the text.
        let trigger_byte_start = text
            .char_indices()
            .nth(trigger_start)
            .map(|(byte_idx, _)| byte_idx)
            .unwrap_or(text.len());
        let trigger_byte_end = text
            .char_indices()
            .nth(trigger_end)
            .map(|(byte_idx, _)| byte_idx)
            .unwrap_or(text.len());
        let mut clause_start = 0;
        // Scan left for the nearest clause delimiter before the trigger.
        for (i, c) in text[..trigger_byte_start].char_indices().rev() {
            if matches!(c, '.' | '!' | '?' | ';' | ':') {
                // Delimiters are ASCII, so `i + 1` is a valid char boundary.
                clause_start = i + 1;
                // Skip whitespace after the delimiter; advancing by
                // `len_utf8` keeps multi-byte whitespace (e.g. U+3000) safe.
                while clause_start < trigger_byte_start {
                    let rest = match text.get(clause_start..) {
                        Some(r) => r,
                        None => break,
                    };
                    match rest.chars().next() {
                        Some(c) if c.is_whitespace() => clause_start += c.len_utf8(),
                        _ => break,
                    }
                }
                break;
            }
        }
        let mut clause_end = text.len();
        // Scan right for the first delimiter after the trigger; the span
        // includes the delimiter character itself.
        for (i, c) in text[trigger_byte_end..].char_indices() {
            if matches!(c, '.' | '!' | '?' | ';' | ':' | ',') {
                clause_end = trigger_byte_end + i + 1;
                break;
            }
        }
        (clause_start, clause_end)
    }
    /// Read-only access to the underlying trigger lexicon.
    #[must_use]
    pub fn lexicon(&self) -> &EventTriggerLexicon {
        &self.lexicon
    }
}
// Unit tests exercise the rule-based path only; no GLiNER model is loaded.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::offset::TextSpan;

    #[test]
    fn test_basic_extraction() {
        let extractor = EventExtractor::new();
        let events = extractor.extract("Russia invaded Ukraine in 2022.");
        assert_eq!(events.len(), 1);
        assert_eq!(events[0].trigger, "invaded");
        assert_eq!(events[0].trigger_type.as_deref(), Some("conflict:attack"));
        assert_eq!(events[0].tense, Some(EventTense::Past));
    }

    #[test]
    fn test_multiple_events() {
        let extractor = EventExtractor::new();
        let events = extractor.extract("The company announced layoffs. Stocks crashed.");
        assert!(
            events.len() >= 2,
            "Should extract at least 2 events, got {}",
            events.len()
        );
        assert!(events.iter().any(|e| e.trigger == "announced"));
        assert!(events.iter().any(|e| e.trigger == "crashed"));
    }

    #[test]
    fn test_negation_detection() {
        let extractor = EventExtractor::new();
        let events = extractor.extract("They did not attack the city.");
        assert_eq!(events[0].polarity, EventPolarity::Negative);
        let events = extractor.extract("They attacked the city.");
        assert_eq!(events[0].polarity, EventPolarity::Positive);
    }

    #[test]
    fn test_hypothetical_detection() {
        let extractor = EventExtractor::new();
        let events = extractor.extract("They might attack tomorrow.");
        assert_eq!(events[0].polarity, EventPolarity::Uncertain);
        assert_eq!(events[0].tense, Some(EventTense::Hypothetical));
    }

    #[test]
    fn test_argument_extraction() {
        let extractor = EventExtractor::new();
        let events = extractor.extract("Russia invaded Ukraine.");
        assert!(!events.is_empty());
        let event = &events[0];
        assert!(event
            .arguments
            .iter()
            .any(|(r, v)| r == "Agent" && v == "Russia"));
        assert!(event
            .arguments
            .iter()
            .any(|(r, v)| r == "Patient" && v == "Ukraine"));
    }

    #[test]
    fn test_discourse_referent_extraction() {
        let extractor = EventExtractor::new();
        let referents =
            extractor.extract_referents("The earthquake struck at dawn. This shocked everyone.");
        assert!(!referents.is_empty());
        let referent = &referents[0];
        assert_eq!(referent.referent_type, ReferentType::Event);
        assert!(referent.event.is_some());
        assert!(referent.text.as_ref().unwrap().contains("earthquake"));
    }

    #[test]
    fn test_lexicon_coverage() {
        let lexicon = EventTriggerLexicon::new();
        assert!(lexicon.lookup("invaded").is_some());
        assert!(lexicon.lookup("announced").is_some());
        assert!(lexicon.lookup("arrested").is_some());
        assert!(lexicon.lookup("earthquake").is_some());
        assert!(lexicon.lookup("crashed").is_some());
        assert!(lexicon.is_negation("not"));
        assert!(lexicon.is_negation("never"));
        assert!(lexicon.is_modal("might"));
        assert!(lexicon.is_modal("could"));
    }

    #[test]
    fn test_with_entities() {
        let extractor = EventExtractor::new();
        let text = "Apple Inc. announced record profits.";
        let entities = vec![Entity::new(
            "Apple Inc.",
            crate::EntityType::Organization,
            0,
            10,
            0.9,
        )];
        let events = extractor.extract_with_entities(text, &entities);
        assert!(!events.is_empty(), "Should extract at least one event");
        let announcement = events.iter().find(|e| e.trigger == "announced");
        assert!(announcement.is_some(), "Should find 'announced' event");
        let event = announcement.unwrap();
        assert!(
            event
                .arguments
                .iter()
                .any(|(r, v)| r == "Agent" && v.contains("Apple")),
            "Should extract Apple Inc. as Agent, got: {:?}",
            event.arguments
        );
    }

    #[test]
    fn test_has_neural_backend() {
        let extractor = EventExtractor::new();
        // `new()` never loads a model, so this is false under either feature.
        #[cfg(not(feature = "candle"))]
        assert!(!extractor.has_neural_backend());
        #[cfg(feature = "candle")]
        assert!(!extractor.has_neural_backend());
    }

    #[test]
    fn test_config_builder() {
        let config = EventExtractorConfig::default()
            .with_threshold(0.7)
            .with_event_labels(vec!["custom event".into()]);
        assert_eq!(config.min_confidence, 0.7);
        assert_eq!(config.event_labels, vec!["custom event"]);
    }

    #[test]
    fn test_event_offsets_are_character_offsets_on_unicode_prefix() {
        let extractor = EventExtractor::new();
        // Multi-byte prefix ensures char offsets differ from byte offsets.
        let text = "🎉 Dr. Müller attacked Kyiv.";
        let events = extractor.extract(text);
        let attacked = events
            .iter()
            .find(|e| e.trigger == "attacked")
            .expect("should extract 'attacked' trigger");
        let extracted =
            TextSpan::from_chars(text, attacked.trigger_start, attacked.trigger_end).extract(text);
        assert_eq!(extracted, "attacked");
    }

    #[test]
    fn test_clause_span_skips_unicode_whitespace_without_panicking() {
        let extractor = EventExtractor::new();
        // U+3000 (ideographic space) is multi-byte whitespace after ".".
        let text = "Intro.\u{3000}Russia invaded Ukraine in 2022. This shocked everyone.";
        let referents = extractor.extract_referents(text);
        assert!(!referents.is_empty());
        assert!(
            referents.iter().any(|r| r
                .text
                .as_deref()
                .unwrap_or("")
                .contains("Russia invaded Ukraine")),
            "Expected an event referent clause containing the invasion clause"
        );
    }
}