Skip to main content

anno/preprocess/
parenthetical.rs

1//! Parenthetical text analysis and entity extraction.
2//!
3//! # Overview
4//!
5//! Parentheticals are text enclosed in parentheses, brackets, or similar delimiters
6//! that often contain valuable entity-related information:
7//!
8//! - **Aliases**: "Barack Obama (Barry)" - alternate names
9//! - **Abbreviations**: "World Health Organization (WHO)"
10//! - **Clarifications**: "The Big Apple (New York City)"
11//! - **Stock tickers**: "Apple Inc. (AAPL)"
12//! - **Temporal bounds**: "Napoleon Bonaparte (1769-1821)"
13//! - **Translations**: "台北 (Taipei)"
14//! - **Descriptions**: "John Smith (CEO of Acme Corp)"
15//!
16//! # Integration with Coalesce
17//!
18//! Parenthetical information provides crucial aliases for cross-document
19//! entity coalescing. When "WHO" appears in one document and "World Health
20//! Organization" in another, the parenthetical establishes the link.
21//!
22//! # Example
23//!
24//! ```rust
25//! use anno::preprocess::parenthetical::{ParentheticalExtractor, ParentheticalType};
26//!
27//! let extractor = ParentheticalExtractor::new();
28//! let text = "Apple Inc. (AAPL) reported earnings.";
29//! let results = extractor.extract(text);
30//!
31//! assert_eq!(results.len(), 1);
32//! assert_eq!(results[0].antecedent, "Apple Inc.");
33//! assert_eq!(results[0].content, "AAPL");
34//! assert_eq!(results[0].parenthetical_type, ParentheticalType::Ticker);
35//! ```
36
37use crate::offset::TextSpan;
38use serde::{Deserialize, Serialize};
39
/// Type of parenthetical content.
///
/// Describes how the text inside the delimiters relates to the text that
/// precedes it (the antecedent). A freshly constructed [`Parenthetical`]
/// carries [`ParentheticalType::Unknown`], which is also the `Default` variant.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum ParentheticalType {
    /// Abbreviation/acronym: "World Health Organization (WHO)"
    Abbreviation,
    /// Stock ticker: "Apple Inc. (AAPL)"
    Ticker,
    /// Alternate name/alias: "William Shakespeare (The Bard)"
    Alias,
    /// Temporal bounds: "Napoleon (1769-1821)"
    TemporalBounds,
    /// Translation/transliteration: "北京 (Beijing)"
    Translation,
    /// Clarification/description: "the company (based in Seattle)"
    Clarification,
    /// Cross-reference: "see Section 3 (above)"
    ///
    /// NOTE(review): no classifier code visible in this file assigns this
    /// variant; it can only be set explicitly (e.g. via `with_type`).
    CrossReference,
    /// Citation: "[Smith et al., 2020]"
    Citation,
    /// Role/title: "John Smith (CEO)"
    Role,
    /// Location qualifier: "Cambridge (Massachusetts)"
    LocationQualifier,
    /// Quantity/measurement: "500ml (about 2 cups)"
    ///
    /// NOTE(review): no classifier code visible in this file assigns this
    /// variant; it can only be set explicitly (e.g. via `with_type`).
    Measurement,
    /// Unknown type (the default; used until a parenthetical is classified)
    #[default]
    Unknown,
}
69
/// A parenthetical extraction result.
///
/// Pairs the text inside a `( ... )` group with the phrase that precedes it
/// and records where both sit in the source text.
///
/// When produced by `ParentheticalExtractor::extract`, all four offsets are
/// taken from the `char_start` / `char_end` fields of a `TextSpan`, i.e. they
/// are character offsets rather than byte offsets (the Unicode test in this
/// file relies on that). Values passed directly to [`Parenthetical::new`] are
/// stored as given.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Parenthetical {
    /// The text preceding the parenthetical (the "antecedent")
    pub antecedent: String,
    /// The content inside the parentheses (delimiters excluded)
    pub content: String,
    /// Start offset of the entire span (antecedent + parenthetical)
    pub start: usize,
    /// End offset of the entire span (exclusive; just past the closing paren)
    pub end: usize,
    /// Start offset of just the parenthetical content
    pub content_start: usize,
    /// End offset of just the parenthetical content (exclusive)
    pub content_end: usize,
    /// Type of parenthetical
    pub parenthetical_type: ParentheticalType,
    /// Confidence in the classification (`0.5` until classified)
    pub confidence: f64,
    /// Whether this creates an alias relationship between antecedent and content
    pub is_alias: bool,
}
92
93impl Parenthetical {
94    /// Create a new parenthetical.
95    pub fn new(
96        antecedent: &str,
97        content: &str,
98        start: usize,
99        end: usize,
100        content_start: usize,
101        content_end: usize,
102    ) -> Self {
103        Self {
104            antecedent: antecedent.to_string(),
105            content: content.to_string(),
106            start,
107            end,
108            content_start,
109            content_end,
110            parenthetical_type: ParentheticalType::Unknown,
111            confidence: 0.5,
112            is_alias: false,
113        }
114    }
115
116    /// Set the type.
117    pub fn with_type(mut self, ptype: ParentheticalType) -> Self {
118        self.parenthetical_type = ptype;
119        self
120    }
121
122    /// Check if this represents an abbreviation.
123    pub fn is_abbreviation(&self) -> bool {
124        matches!(self.parenthetical_type, ParentheticalType::Abbreviation)
125    }
126
127    /// Check if this represents a stock ticker.
128    pub fn is_ticker(&self) -> bool {
129        matches!(self.parenthetical_type, ParentheticalType::Ticker)
130    }
131
132    /// Check if this provides temporal bounds for an entity.
133    pub fn is_temporal(&self) -> bool {
134        matches!(self.parenthetical_type, ParentheticalType::TemporalBounds)
135    }
136
137    /// Get the alias if this parenthetical creates one.
138    ///
139    /// For abbreviations and aliases, returns the content.
140    /// For some types, the antecedent might be the alias.
141    pub fn get_alias(&self) -> Option<&str> {
142        if self.is_alias {
143            Some(&self.content)
144        } else {
145            None
146        }
147    }
148}
149
150/// Extractor for parenthetical information.
151#[derive(Debug, Clone, Default)]
152pub struct ParentheticalExtractor {
153    /// Minimum antecedent length to consider
154    min_antecedent_len: usize,
155    /// Maximum parenthetical content length
156    max_content_len: usize,
157}
158
159impl ParentheticalExtractor {
160    /// Create a new extractor with default settings.
161    pub fn new() -> Self {
162        Self {
163            min_antecedent_len: 2,
164            max_content_len: 100,
165        }
166    }
167
168    /// Set minimum antecedent length.
169    pub fn with_min_antecedent(mut self, len: usize) -> Self {
170        self.min_antecedent_len = len;
171        self
172    }
173
174    /// Extract parentheticals from text.
175    pub fn extract(&self, text: &str) -> Vec<Parenthetical> {
176        let mut results = Vec::new();
177        let chars: Vec<(usize, char)> = text.char_indices().collect();
178        let mut i = 0;
179
180        while i < chars.len() {
181            if chars[i].1 == '(' {
182                let open_idx = chars[i].0;
183
184                // Find matching close paren
185                let mut depth = 1;
186                let mut j = i + 1;
187                while j < chars.len() && depth > 0 {
188                    match chars[j].1 {
189                        '(' => depth += 1,
190                        ')' => depth -= 1,
191                        _ => {}
192                    }
193                    j += 1;
194                }
195
196                if depth == 0 && j > i + 1 {
197                    let close_idx = chars[j - 1].0;
198                    let content_start = open_idx + 1;
199                    let content_end = close_idx;
200                    let content = &text[content_start..content_end];
201
202                    // Skip if content too long
203                    if content.chars().count() <= self.max_content_len {
204                        // Find antecedent (text before the parenthetical)
205                        let (antecedent, antecedent_start_byte, _antecedent_end_byte) =
206                            self.find_antecedent(text, open_idx);
207
208                        if antecedent.chars().count() >= self.min_antecedent_len {
209                            let start_byte = antecedent_start_byte;
210                            let end_byte = close_idx + 1; // ')' is ASCII
211                            let span = TextSpan::from_bytes(text, start_byte, end_byte);
212                            let content_span =
213                                TextSpan::from_bytes(text, content_start, content_end);
214
215                            let mut paren = Parenthetical::new(
216                                &antecedent,
217                                content,
218                                span.char_start,
219                                span.char_end,
220                                content_span.char_start,
221                                content_span.char_end,
222                            );
223
224                            // Classify the parenthetical
225                            paren = self.classify(paren);
226
227                            results.push(paren);
228                        }
229                    }
230                }
231                i = j;
232            } else {
233                i += 1;
234            }
235        }
236
237        results
238    }
239
240    /// Find the antecedent (text before the parenthetical).
241    ///
242    /// Returns:
243    /// - antecedent text (trimmed)
244    /// - antecedent start byte offset (inclusive)
245    /// - antecedent end byte offset (exclusive)
246    fn find_antecedent(&self, text: &str, paren_start: usize) -> (String, usize, usize) {
247        if paren_start == 0 {
248            return (String::new(), 0, 0);
249        }
250
251        // Work backwards from the parenthesis
252        let before = &text[..paren_start];
253        let trimmed = before.trim_end();
254        let trimmed_end = trimmed.len(); // byte offset in `text` (trimmed is a prefix slice)
255
256        // Find the start of the phrase, but ignore periods in common abbreviations
257        // like "Inc.", "Corp.", "Ltd.", "Dr.", "Mr.", "Mrs.", "Ms.", "Jr.", "Sr."
258        let abbrev_suffixes = [
259            "Inc.", "Corp.", "Ltd.", "LLC.", "Co.", "Ltd", "Dr.", "Mr.", "Mrs.", "Ms.", "Jr.",
260            "Sr.", "Ph.D.", "M.D.", "Prof.", "Rev.", "Gen.", "Col.", "Capt.", "Sgt.", "St.", "Mt.",
261            "Ave.", "Blvd.", "Rd.",
262        ];
263
264        // Find sentence boundaries, but skip if it's part of an abbreviation
265        let mut phrase_start = 0;
266        let bytes = trimmed.as_bytes();
267
268        for i in (0..bytes.len()).rev() {
269            let c = bytes[i] as char;
270            if c == '.' || c == ',' || c == ';' || c == ':' || c == '\n' {
271                // Check if this is an abbreviation
272                let suffix = &trimmed[..=i];
273                let is_abbrev = abbrev_suffixes.iter().any(|abbr| suffix.ends_with(abbr));
274
275                if !is_abbrev || c != '.' {
276                    phrase_start = i + 1;
277                    break;
278                }
279            }
280        }
281
282        // Skip any leading whitespace between phrase_start and trimmed_end.
283        let mut antecedent_start = phrase_start;
284        for (rel, c) in trimmed[phrase_start..].char_indices() {
285            if !c.is_whitespace() {
286                antecedent_start = phrase_start + rel;
287                break;
288            }
289        }
290
291        let antecedent = trimmed[antecedent_start..trimmed_end].to_string();
292        (antecedent, antecedent_start, trimmed_end)
293    }
294
295    /// Classify the type of parenthetical.
296    fn classify(&self, mut paren: Parenthetical) -> Parenthetical {
297        let content = paren.content.trim();
298        let antecedent = paren.antecedent.trim();
299
300        // Check for stock ticker: all caps, 1-5 letters
301        if content.len() <= 5
302            && content.chars().all(|c| c.is_ascii_uppercase())
303            && !content.is_empty()
304        {
305            // Check if antecedent looks like a company name
306            if antecedent.ends_with("Inc.")
307                || antecedent.ends_with("Corp.")
308                || antecedent.ends_with("Ltd.")
309                || antecedent.ends_with("LLC")
310                || antecedent.ends_with("Company")
311            {
312                paren.parenthetical_type = ParentheticalType::Ticker;
313                paren.is_alias = true;
314                paren.confidence = 0.9;
315                return paren;
316            }
317        }
318
319        // Check for abbreviation/acronym
320        if self.is_likely_abbreviation(antecedent, content) {
321            paren.parenthetical_type = ParentheticalType::Abbreviation;
322            paren.is_alias = true;
323            paren.confidence = 0.85;
324            return paren;
325        }
326
327        // Check for temporal bounds (years, date ranges)
328        if self.is_temporal_bounds(content) {
329            paren.parenthetical_type = ParentheticalType::TemporalBounds;
330            paren.confidence = 0.9;
331            return paren;
332        }
333
334        // Check for translation (contains non-ASCII)
335        if !content.is_ascii() || !antecedent.is_ascii() {
336            paren.parenthetical_type = ParentheticalType::Translation;
337            paren.is_alias = true;
338            paren.confidence = 0.7;
339            return paren;
340        }
341
342        // Check for role/title
343        if self.is_role(content) {
344            paren.parenthetical_type = ParentheticalType::Role;
345            paren.confidence = 0.8;
346            return paren;
347        }
348
349        // Check for location qualifier
350        if self.is_location_qualifier(content) {
351            paren.parenthetical_type = ParentheticalType::LocationQualifier;
352            paren.confidence = 0.75;
353            return paren;
354        }
355
356        // Check for citation
357        if content.starts_with('[')
358            || content.contains("et al")
359            || content.contains("19")
360            || content.contains("20")
361        {
362            paren.parenthetical_type = ParentheticalType::Citation;
363            paren.confidence = 0.7;
364            return paren;
365        }
366
367        // Default to alias if short content that looks like a name
368        if content.split_whitespace().count() <= 3
369            && content
370                .chars()
371                .next()
372                .map(|c| c.is_uppercase())
373                .unwrap_or(false)
374        {
375            paren.parenthetical_type = ParentheticalType::Alias;
376            paren.is_alias = true;
377            paren.confidence = 0.6;
378            return paren;
379        }
380
381        // Default to clarification
382        paren.parenthetical_type = ParentheticalType::Clarification;
383        paren.confidence = 0.5;
384        paren
385    }
386
387    /// Check if content is likely an abbreviation of antecedent.
388    fn is_likely_abbreviation(&self, antecedent: &str, content: &str) -> bool {
389        // All caps content
390        if !content
391            .chars()
392            .all(|c| c.is_uppercase() || c.is_whitespace() || c == '.')
393        {
394            return false;
395        }
396
397        // Check if initials match
398        let antecedent_initials: String = antecedent
399            .split_whitespace()
400            .filter_map(|w| w.chars().next())
401            .filter(|c| c.is_uppercase())
402            .collect();
403
404        let content_letters: String = content.chars().filter(|c| c.is_alphabetic()).collect();
405
406        if antecedent_initials == content_letters {
407            return true;
408        }
409
410        // Check if content could be abbreviation (3+ uppercase letters)
411        content.len() >= 2 && content.len() <= 10
412    }
413
414    /// Check if content represents temporal bounds (birth-death years, etc.)
415    fn is_temporal_bounds(&self, content: &str) -> bool {
416        // Match patterns like "1769-1821", "b. 1950", "1920s", "born 1985"
417        let patterns = [
418            r"^\d{4}\s*[-–—]\s*\d{4}$",            // 1769-1821
419            r"^\d{4}\s*[-–—]\s*(present|\d{4})?$", // 1990-present or 1990-
420            r"^b\.\s*\d{4}$",                      // b. 1950
421            r"^d\.\s*\d{4}$",                      // d. 2020
422            r"^born\s+\d{4}$",                     // born 1985
423            r"^\d{4}s$",                           // 1920s
424        ];
425
426        for pattern in &patterns {
427            if let Ok(re) = regex::Regex::new(pattern) {
428                if re.is_match(content) {
429                    return true;
430                }
431            }
432        }
433
434        false
435    }
436
437    /// Check if content looks like a role/title.
438    fn is_role(&self, content: &str) -> bool {
439        let role_indicators = [
440            "CEO",
441            "CFO",
442            "CTO",
443            "COO",
444            "CMO",
445            "President",
446            "Director",
447            "Manager",
448            "Chairman",
449            "Senator",
450            "Governor",
451            "Mayor",
452            "Minister",
453            "Dr.",
454            "Prof.",
455            "Rev.",
456            "founder",
457            "co-founder",
458            "editor",
459        ];
460
461        let lower = content.to_lowercase();
462        role_indicators
463            .iter()
464            .any(|r| lower.contains(&r.to_lowercase()))
465    }
466
467    /// Check if content is a location qualifier.
468    fn is_location_qualifier(&self, content: &str) -> bool {
469        let qualifiers = [
470            "UK",
471            "US",
472            "USA",
473            "England",
474            "Scotland",
475            "Wales",
476            "Massachusetts",
477            "California",
478            "Texas",
479            "New York",
480            "Ontario",
481            "Quebec",
482            "Bavaria",
483            "Saxony",
484        ];
485
486        qualifiers.iter().any(|q| content.contains(q))
487    }
488}
489
/// Alias pair extracted from parentheticals.
///
/// Used for feeding into coalesce module. Each pair links the antecedent of
/// a parenthetical (`primary`) with the parenthetical content (`alias`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AliasPair {
    /// Primary name/surface form (the parenthetical's antecedent)
    pub primary: String,
    /// Alias name/surface form (the parenthetical's content)
    pub alias: String,
    /// Source document ID, if the caller supplied one
    pub doc_id: Option<String>,
    /// Confidence in this alias relationship (copied from the parenthetical)
    pub confidence: f64,
    /// Type of alias relationship (copied from the parenthetical)
    pub alias_type: ParentheticalType,
}
506
507impl AliasPair {
508    /// Create from a parenthetical.
509    pub fn from_parenthetical(paren: &Parenthetical, doc_id: Option<&str>) -> Option<Self> {
510        if !paren.is_alias {
511            return None;
512        }
513
514        Some(Self {
515            primary: paren.antecedent.clone(),
516            alias: paren.content.clone(),
517            doc_id: doc_id.map(|s| s.to_string()),
518            confidence: paren.confidence,
519            alias_type: paren.parenthetical_type.clone(),
520        })
521    }
522}
523
524/// Extract alias pairs from text for coalescing.
525pub fn extract_aliases(text: &str, doc_id: Option<&str>) -> Vec<AliasPair> {
526    let extractor = ParentheticalExtractor::new();
527    let parentheticals = extractor.extract(text);
528
529    parentheticals
530        .iter()
531        .filter_map(|p| AliasPair::from_parenthetical(p, doc_id))
532        .collect()
533}
534
/// Unit tests for parenthetical extraction, classification and alias pairing.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::offset::TextSpan;

    /// An all-caps parenthetical after a multi-word name is an abbreviation alias.
    #[test]
    fn test_abbreviation_extraction() {
        let extractor = ParentheticalExtractor::new();
        let text = "The World Health Organization (WHO) announced new guidelines.";
        let results = extractor.extract(text);

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].antecedent, "The World Health Organization");
        assert_eq!(results[0].content, "WHO");
        assert_eq!(
            results[0].parenthetical_type,
            ParentheticalType::Abbreviation
        );
        assert!(results[0].is_alias);
    }

    /// Short all-caps content after a company suffix ("Inc.") is a ticker.
    #[test]
    fn test_ticker_extraction() {
        let extractor = ParentheticalExtractor::new();
        let text = "Apple Inc. (AAPL) reported strong earnings.";
        let results = extractor.extract(text);

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].content, "AAPL");
        assert_eq!(results[0].parenthetical_type, ParentheticalType::Ticker);
    }

    /// A "YYYY-YYYY" range is classified as temporal bounds.
    #[test]
    fn test_temporal_bounds() {
        let extractor = ParentheticalExtractor::new();
        let text = "Napoleon Bonaparte (1769-1821) was Emperor of France.";
        let results = extractor.extract(text);

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].content, "1769-1821");
        assert_eq!(
            results[0].parenthetical_type,
            ParentheticalType::TemporalBounds
        );
    }

    /// A non-ASCII antecedent with ASCII content is classified as a translation.
    #[test]
    fn test_translation() {
        let extractor = ParentheticalExtractor::new();
        let text = "北京 (Beijing) is the capital.";
        let results = extractor.extract(text);

        assert_eq!(results.len(), 1);
        assert_eq!(
            results[0].parenthetical_type,
            ParentheticalType::Translation
        );
    }

    /// Mixed-case content containing a role indicator ("CEO") is a role.
    #[test]
    fn test_role_extraction() {
        let extractor = ParentheticalExtractor::new();
        let text = "Tim Cook (CEO of Apple) spoke at the conference.";
        let results = extractor.extract(text);

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].parenthetical_type, ParentheticalType::Role);
    }

    /// Stored offsets must round-trip through `TextSpan::from_chars` even when
    /// the antecedent contains multi-byte characters.
    #[test]
    fn test_parenthetical_offsets_are_character_offsets_with_unicode_prefix() {
        // ü is multi-byte, so byte offsets != char offsets; ensure we store char offsets.
        let extractor = ParentheticalExtractor::new();
        let text = "Müller (CEO) spoke.";
        let results = extractor.extract(text);
        assert_eq!(results.len(), 1);

        let p = &results[0];
        let span_text = TextSpan::from_chars(text, p.start, p.end).extract(text);
        assert_eq!(span_text, "Müller (CEO)");

        let content_text = TextSpan::from_chars(text, p.content_start, p.content_end).extract(text);
        assert_eq!(content_text, "CEO");
    }

    /// `extract_aliases` turns an abbreviation into a (primary, alias) pair
    /// and attaches the supplied document id.
    #[test]
    fn test_alias_pair_extraction() {
        let text = "The United Nations (UN) held a meeting.";
        let aliases = extract_aliases(text, Some("doc1"));

        assert_eq!(aliases.len(), 1);
        assert_eq!(aliases[0].primary, "The United Nations");
        assert_eq!(aliases[0].alias, "UN");
        assert_eq!(aliases[0].doc_id, Some("doc1".to_string()));
    }

    /// Two separate parentheticals in one sentence are both reported.
    #[test]
    fn test_multiple_parentheticals() {
        let extractor = ParentheticalExtractor::new();
        let text = "Microsoft Corp. (MSFT) and Apple Inc. (AAPL) are tech giants.";
        let results = extractor.extract(text);

        assert_eq!(results.len(), 2);
    }

    /// Despite the name, the outer group is NOT skipped: only the nested
    /// `(x)` is folded into the outer parenthetical, giving a single result.
    #[test]
    fn test_nested_parentheses_skipped() {
        let extractor = ParentheticalExtractor::new();
        let text = "Complex formula (f(x) = x^2) is quadratic.";
        let results = extractor.extract(text);

        // Should still extract the outer parenthetical
        assert_eq!(results.len(), 1);
    }
}