Skip to main content

anno/preprocess/
apposition.rs

1//! Apposition and Alias Pattern Extraction.
2//!
3//! Extracts alias relationships from various linguistic patterns beyond parentheticals.
4//!
5//! # Supported Patterns
6//!
7//! | Pattern | Example | Type |
8//! |---------|---------|------|
9//! | **Also known as** | "Peter Parker, also known as Spider-Man" | [`AppositionType::AlsoKnownAs`] |
10//! | **AKA** | "Ringo Starr, aka Richard Starkey" | [`AppositionType::Aka`] |
11//! | **Born** | "Lady Gaga, born Stefani Germanotta" | [`AppositionType::BirthName`] |
12//! | **Formerly** | "Mumbai, formerly Bombay" | [`AppositionType::FormerlyKnownAs`] |
13//! | **Now** | "Facebook, now Meta" | [`AppositionType::NowKnownAs`] |
14//! | **Nickname** | "Dwayne 'The Rock' Johnson" | [`AppositionType::Nickname`] |
15//! | **Colon** | "AWS: Amazon Web Services" | [`AppositionType::ColonExpansion`] |
16//! | **Née** | "Hillary Clinton, née Rodham" | [`AppositionType::Nee`] |
17//! | **Better known as** | "Marshall Mathers, better known as Eminem" | [`AppositionType::BetterKnownAs`] |
18//!
19//! # Canonical vs. Alternate Forms
20//!
21//! Different patterns have different directionality:
22//!
23//! - **"born X"**: The birth name (X) is canonical (legal name)
24//! - **"better known as X"**: X is canonical (more recognized)
25//! - **"formerly X"**: Current name is canonical
26//! - **"aka X"**: Primary mention is canonical
27//!
28//! Use [`Apposition::canonical()`] and [`Apposition::alternate()`] to get the correct form.
29//!
30//! # Integration with Coalesce
31//!
32//! Extracted aliases integrate with `anno::coalesce` for cross-document entity linking:
33//!
34//! ```text
35//! Appositions ──► AliasPairs ──► Coalesce ──► Unified Identities
36//! ```
37//!
38//! # Example
39//!
40//! ```rust
41//! use anno::preprocess::apposition::{AppositionExtractor, AppositionType};
42//!
43//! let extractor = AppositionExtractor::new();
44//! let text = "Lady Gaga, born Stefani Germanotta, is a singer.";
45//! let results = extractor.extract(text);
46//!
47//! assert_eq!(results.len(), 1);
48//! assert_eq!(results[0].primary, "Lady Gaga");
49//! assert_eq!(results[0].alias, "Stefani Germanotta");
50//! assert_eq!(results[0].apposition_type, AppositionType::BirthName);
51//!
52//! // Birth name is the canonical (legal) form
53//! assert_eq!(results[0].canonical(), "Stefani Germanotta");
54//! assert_eq!(results[0].alternate(), "Lady Gaga");
55//! ```
56//!
57//! # Linguistic Background
58//!
59//! These patterns are related to but distinct from:
60//!
61//! - **Appositions**: Noun phrases that rename another noun ("Obama, the president")
62//! - **Parentheticals**: Insertions that can be removed without loss of grammaticality
63//! - **Copular constructions**: "X is Y" relationships
64//!
65//! This module focuses specifically on **alias-introducing patterns** that establish
66//! identity relationships between different surface forms of the same entity.
67
68use serde::{Deserialize, Serialize};
69
70/// Type of apposition/alias pattern.
71///
72/// Each type has different semantics for which form is "canonical":
73///
74/// - `BirthName`, `RealName`: The alias is canonical (legal name)
75/// - `BetterKnownAs`, `NowKnownAs`: The alias is canonical (current/famous name)
76/// - `FormerlyKnownAs`, `Aka`, `AlsoKnownAs`: The primary is canonical
77#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
78pub enum AppositionType {
79    /// Standard appositive: "Obama, the president, ..."
80    Appositive,
81    /// Also known as: "Spider-Man, also known as Peter Parker"
82    AlsoKnownAs,
83    /// AKA abbreviation: "Ringo Starr, aka Richard Starkey"
84    Aka,
85    /// Nickname in quotes: "Dwayne 'The Rock' Johnson"
86    Nickname,
87    /// Birth name: "Lady Gaga, born Stefani Germanotta"
88    BirthName,
89    /// Former name: "Mumbai, formerly Bombay"
90    FormerlyKnownAs,
91    /// Renamed: "Meta, formerly Facebook"
92    Renamed,
93    /// Now known as: "Facebook, now Meta"
94    NowKnownAs,
95    /// Colon expansion: "AWS: Amazon Web Services"
96    ColonExpansion,
97    /// Or alternative: "Myanmar (or Burma)"
98    OrAlternative,
99    /// Real name: "Eminem, real name Marshall Mathers"
100    RealName,
101    /// Better known as: "Marshall Mathers, better known as Eminem"
102    BetterKnownAs,
103    /// Née (maiden name): "Hillary Clinton, née Rodham"
104    Nee,
105    /// Styled as: "Prince, styled as ꛦ for a period"
106    StyledAs,
107    /// Generic alias
108    #[default]
109    Generic,
110}
111
112/// An extracted apposition/alias relationship.
113///
114/// Represents an alias relationship between two surface forms of an entity.
115/// The `primary` field contains the first-mentioned form, and `alias` contains
116/// the form introduced by the pattern.
117///
118/// Use [`canonical()`](Apposition::canonical) and [`alternate()`](Apposition::alternate)
119/// to get the semantically appropriate form regardless of mention order.
120#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct Apposition {
122    /// Primary entity text (first-mentioned form)
123    pub primary: String,
124    /// Alias/alternate text (introduced by pattern)
125    pub alias: String,
126    /// Start offset of entire span in source text
127    pub start: usize,
128    /// End offset of entire span in source text
129    pub end: usize,
130    /// Type of apposition pattern
131    pub apposition_type: AppositionType,
132    /// Confidence in extraction (0.0-1.0)
133    pub confidence: f64,
134    /// Direction: true if primary→alias is the name→alias direction
135    pub primary_is_canonical: bool,
136}
137
138impl Apposition {
139    /// Create a new apposition with default settings.
140    ///
141    /// By default, assumes the primary (first-mentioned) form is canonical.
142    pub fn new(primary: &str, alias: &str, start: usize, end: usize) -> Self {
143        Self {
144            primary: primary.to_string(),
145            alias: alias.to_string(),
146            start,
147            end,
148            apposition_type: AppositionType::Generic,
149            confidence: 0.7,
150            primary_is_canonical: true,
151        }
152    }
153
154    /// Set the apposition type.
155    #[must_use]
156    pub fn with_type(mut self, atype: AppositionType) -> Self {
157        self.apposition_type = atype;
158        self
159    }
160
161    /// Mark that the alias is the canonical form.
162    ///
163    /// Use for patterns like "born X" or "better known as X" where the
164    /// introduced form is the canonical one.
165    #[must_use]
166    pub fn alias_is_canonical(mut self) -> Self {
167        self.primary_is_canonical = false;
168        self
169    }
170
171    /// Get the canonical (preferred) form.
172    ///
173    /// # Examples
174    ///
175    /// - "Lady Gaga, born Stefani Germanotta" → "Stefani Germanotta" (legal name)
176    /// - "Mumbai, formerly Bombay" → "Mumbai" (current name)
177    pub fn canonical(&self) -> &str {
178        if self.primary_is_canonical {
179            &self.primary
180        } else {
181            &self.alias
182        }
183    }
184
185    /// Get the alternate (non-canonical) form.
186    pub fn alternate(&self) -> &str {
187        if self.primary_is_canonical {
188            &self.alias
189        } else {
190            &self.primary
191        }
192    }
193}
194
/// Extractor for appositions and alias patterns.
///
/// # Configuration
///
/// Each pattern category is guarded by its own boolean flag. Both
/// [`AppositionExtractor::new`] and [`Default::default`] turn every
/// category on:
///
/// ```rust
/// use anno::preprocess::apposition::AppositionExtractor;
///
/// let extractor = AppositionExtractor::new();
/// // All patterns enabled by default
/// ```
#[derive(Debug, Clone)]
pub struct AppositionExtractor {
    /// Extract appositives (comma-delimited)
    #[allow(dead_code)] // Future configurability
    extract_appositives: bool,
    /// Extract AKA patterns
    extract_aka: bool,
    /// Extract nickname patterns
    extract_nicknames: bool,
    /// Extract formerly/now patterns
    extract_rename: bool,
    /// Extract colon expansions
    extract_colon: bool,
}

impl Default for AppositionExtractor {
    /// Build an extractor with every pattern category enabled.
    ///
    /// This is written by hand because a derived `Default` would set every
    /// `bool` flag to `false`, producing an extractor that skips most patterns.
    fn default() -> Self {
        Self {
            extract_appositives: true,
            extract_aka: true,
            extract_nicknames: true,
            extract_rename: true,
            extract_colon: true,
        }
    }
}
221
222impl AppositionExtractor {
223    /// Create a new extractor with all patterns enabled.
224    pub fn new() -> Self {
225        Self {
226            extract_appositives: true,
227            extract_aka: true,
228            extract_nicknames: true,
229            extract_rename: true,
230            extract_colon: true,
231        }
232    }
233
234    /// Extract all alias patterns from text.
235    ///
236    /// Returns a deduplicated list sorted by position. Overlapping
237    /// extractions are resolved by keeping the highest-confidence one.
238    pub fn extract(&self, text: &str) -> Vec<Apposition> {
239        let mut results = Vec::new();
240
241        // AKA patterns
242        if self.extract_aka {
243            results.extend(self.extract_aka_patterns(text));
244        }
245
246        // Born patterns
247        results.extend(self.extract_born_patterns(text));
248
249        // Formerly patterns
250        if self.extract_rename {
251            results.extend(self.extract_rename_patterns(text));
252        }
253
254        // Nickname patterns
255        if self.extract_nicknames {
256            results.extend(self.extract_nickname_patterns(text));
257        }
258
259        // Colon expansions
260        if self.extract_colon {
261            results.extend(self.extract_colon_patterns(text));
262        }
263
264        // Née patterns
265        results.extend(self.extract_nee_patterns(text));
266
267        // Sort by position, deduplicate overlapping
268        results.sort_by_key(|a| a.start);
269        self.remove_overlaps(results)
270    }
271
272    /// Extract "also known as" / "aka" patterns.
273    fn extract_aka_patterns(&self, text: &str) -> Vec<Apposition> {
274        let mut results = Vec::new();
275        let _lower = text.to_lowercase();
276
277        // Pattern: "X, also known as Y"
278        let patterns = [
279            (
280                r"([A-Z][^,]+),\s*also known as\s+([A-Z][^,.]+)",
281                AppositionType::AlsoKnownAs,
282                true,
283            ),
284            (
285                r"([A-Z][^,]+),\s*a\.k\.a\.?\s+([A-Z][^,.]+)",
286                AppositionType::Aka,
287                true,
288            ),
289            (
290                r"([A-Z][^,]+),\s*aka\s+([A-Z][^,.]+)",
291                AppositionType::Aka,
292                true,
293            ),
294            (
295                r"([A-Z][^,]+),\s*better known as\s+([A-Z][^,.]+)",
296                AppositionType::BetterKnownAs,
297                false,
298            ),
299            (
300                r"([A-Z][^,]+),\s*real name\s+([A-Z][^,.]+)",
301                AppositionType::RealName,
302                false,
303            ),
304        ];
305
306        for (pattern, atype, primary_canonical) in &patterns {
307            if let Ok(re) = regex::Regex::new(pattern) {
308                for cap in re.captures_iter(text) {
309                    if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
310                        let mut appo = Apposition::new(
311                            m1.as_str().trim(),
312                            m2.as_str().trim(),
313                            cap.get(0).expect("regex match should have group 0").start(),
314                            cap.get(0).expect("regex match should have group 0").end(),
315                        )
316                        .with_type(atype.clone());
317
318                        if !*primary_canonical {
319                            appo = appo.alias_is_canonical();
320                        }
321                        appo.confidence = 0.9;
322
323                        results.push(appo);
324                    }
325                }
326            }
327        }
328
329        results
330    }
331
332    /// Extract "born X" patterns.
333    fn extract_born_patterns(&self, text: &str) -> Vec<Apposition> {
334        let mut results = Vec::new();
335
336        // Pattern: "X, born Y" or "X (born Y)"
337        let patterns = [
338            r"([A-Z][A-Za-z\s]+),\s*born\s+([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
339            r"([A-Z][A-Za-z\s]+)\s*\(born\s+([A-Z][A-Za-z\s]+)\)",
340        ];
341
342        for pattern in &patterns {
343            if let Ok(re) = regex::Regex::new(pattern) {
344                for cap in re.captures_iter(text) {
345                    if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
346                        let appo = Apposition::new(
347                            m1.as_str().trim(),
348                            m2.as_str().trim(),
349                            cap.get(0).expect("regex match should have group 0").start(),
350                            cap.get(0).expect("regex match should have group 0").end(),
351                        )
352                        .with_type(AppositionType::BirthName)
353                        .alias_is_canonical(); // Birth name is the canonical legal name
354
355                        results.push(appo);
356                    }
357                }
358            }
359        }
360
361        results
362    }
363
364    /// Extract "formerly" / "now" / "previously" patterns.
365    fn extract_rename_patterns(&self, text: &str) -> Vec<Apposition> {
366        let mut results = Vec::new();
367
368        let patterns = [
369            (
370                r"([A-Z][A-Za-z\s]+),\s*formerly\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
371                AppositionType::FormerlyKnownAs,
372                true,
373            ),
374            (
375                r"([A-Z][A-Za-z\s]+),\s*previously\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
376                AppositionType::FormerlyKnownAs,
377                true,
378            ),
379            (
380                r"([A-Z][A-Za-z\s]+),\s*now\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
381                AppositionType::NowKnownAs,
382                false,
383            ),
384            (
385                r"([A-Z][A-Za-z\s]+),\s*currently\s+(?:known as\s+)?([A-Z][A-Za-z\s]+?)(?:[,.]|$)",
386                AppositionType::NowKnownAs,
387                false,
388            ),
389        ];
390
391        for (pattern, atype, primary_canonical) in &patterns {
392            if let Ok(re) = regex::Regex::new(pattern) {
393                for cap in re.captures_iter(text) {
394                    if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
395                        let mut appo = Apposition::new(
396                            m1.as_str().trim(),
397                            m2.as_str().trim(),
398                            cap.get(0).expect("regex match should have group 0").start(),
399                            cap.get(0).expect("regex match should have group 0").end(),
400                        )
401                        .with_type(atype.clone());
402
403                        if !*primary_canonical {
404                            appo = appo.alias_is_canonical();
405                        }
406                        appo.confidence = 0.85;
407
408                        results.push(appo);
409                    }
410                }
411            }
412        }
413
414        results
415    }
416
417    /// Extract nickname patterns (quotes).
418    fn extract_nickname_patterns(&self, text: &str) -> Vec<Apposition> {
419        let mut results = Vec::new();
420
421        // Pattern: "FirstName 'Nickname' LastName" or "FirstName "Nickname" LastName"
422        let patterns = [
423            r#"([A-Z][a-z]+)\s+'([A-Z][^']+)'\s+([A-Z][a-z]+)"#,
424            r#"([A-Z][a-z]+)\s+"([A-Z][^"]+)"\s+([A-Z][a-z]+)"#,
425            r#"([A-Z][a-z]+)\s+'([A-Z][^']+)'\s+([A-Z][a-z]+)"#, // curly quotes
426        ];
427
428        for pattern in &patterns {
429            if let Ok(re) = regex::Regex::new(pattern) {
430                for cap in re.captures_iter(text) {
431                    if let (Some(first), Some(nick), Some(last)) =
432                        (cap.get(1), cap.get(2), cap.get(3))
433                    {
434                        let full_name = format!("{} {}", first.as_str(), last.as_str());
435                        let appo = Apposition::new(
436                            &full_name,
437                            nick.as_str(),
438                            cap.get(0).expect("regex match should have group 0").start(),
439                            cap.get(0).expect("regex match should have group 0").end(),
440                        )
441                        .with_type(AppositionType::Nickname);
442
443                        results.push(appo);
444                    }
445                }
446            }
447        }
448
449        results
450    }
451
452    /// Extract colon expansion patterns.
453    fn extract_colon_patterns(&self, text: &str) -> Vec<Apposition> {
454        let mut results = Vec::new();
455
456        // Pattern: "ABBREV: Full Name" - match capitalized name after colon
457        // The full form typically ends at lowercase word start or punctuation
458        if let Ok(re) = regex::Regex::new(r"([A-Z]{2,8}):\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)") {
459            for cap in re.captures_iter(text) {
460                if let (Some(abbrev), Some(full)) = (cap.get(1), cap.get(2)) {
461                    let full_text = full.as_str().trim();
462                    let group_0 = cap.get(0).expect("regex match should have group 0");
463                    let appo =
464                        Apposition::new(full_text, abbrev.as_str(), group_0.start(), group_0.end())
465                            .with_type(AppositionType::ColonExpansion);
466
467                    results.push(appo);
468                }
469            }
470        }
471
472        results
473    }
474
475    /// Extract "née" patterns for maiden names.
476    fn extract_nee_patterns(&self, text: &str) -> Vec<Apposition> {
477        let mut results = Vec::new();
478
479        let patterns = [
480            r"([A-Z][A-Za-z\s]+),\s*née\s+([A-Z][a-z]+)",
481            r"([A-Z][A-Za-z\s]+),\s*nee\s+([A-Z][a-z]+)",
482            r"([A-Z][A-Za-z\s]+)\s*\(née\s+([A-Z][a-z]+)\)",
483        ];
484
485        for pattern in &patterns {
486            if let Ok(re) = regex::Regex::new(pattern) {
487                for cap in re.captures_iter(text) {
488                    if let (Some(m1), Some(m2)) = (cap.get(1), cap.get(2)) {
489                        let appo = Apposition::new(
490                            m1.as_str().trim(),
491                            m2.as_str().trim(),
492                            cap.get(0).expect("regex match should have group 0").start(),
493                            cap.get(0).expect("regex match should have group 0").end(),
494                        )
495                        .with_type(AppositionType::Nee);
496
497                        results.push(appo);
498                    }
499                }
500            }
501        }
502
503        results
504    }
505
506    /// Remove overlapping extractions, keeping highest confidence.
507    fn remove_overlaps(&self, mut appos: Vec<Apposition>) -> Vec<Apposition> {
508        appos.sort_by(|a, b| {
509            b.confidence
510                .partial_cmp(&a.confidence)
511                .unwrap_or(std::cmp::Ordering::Equal)
512        });
513
514        let mut result = Vec::new();
515        for appo in appos {
516            let overlaps = result
517                .iter()
518                .any(|a: &Apposition| appo.start < a.end && appo.end > a.start);
519            if !overlaps {
520                result.push(appo);
521            }
522        }
523
524        result.sort_by_key(|a| a.start);
525        result
526    }
527}
528
529/// Combined alias extraction from all sources.
530///
531/// Combines parentheticals and appositions into a unified alias list.
532/// Returns tuples of (canonical, alternate, confidence).
533///
534/// # Example
535///
536/// ```rust
537/// use anno::preprocess::apposition::extract_all_aliases;
538///
539/// let text = "Apple Inc. (AAPL), formerly Apple Computer, is based in Cupertino.";
540/// let aliases = extract_all_aliases(text);
541///
542/// // Will find both the ticker parenthetical and the formerly pattern
543/// for (canonical, alternate, confidence) in aliases {
544///     println!("{} = {} ({:.2})", canonical, alternate, confidence);
545/// }
546/// ```
547pub fn extract_all_aliases(text: &str) -> Vec<(String, String, f64)> {
548    use super::parenthetical::ParentheticalExtractor;
549
550    let mut aliases = Vec::new();
551
552    // Parentheticals
553    let paren_ext = ParentheticalExtractor::new();
554    for paren in paren_ext.extract(text) {
555        if paren.is_alias {
556            aliases.push((paren.antecedent, paren.content, paren.confidence));
557        }
558    }
559
560    // Appositions
561    let appo_ext = AppositionExtractor::new();
562    for appo in appo_ext.extract(text) {
563        aliases.push((
564            appo.canonical().to_string(),
565            appo.alternate().to_string(),
566            appo.confidence,
567        ));
568    }
569
570    aliases
571}
572
#[cfg(test)]
mod tests {
    use super::*;

    /// Run a fully-enabled extractor over `text`, assert that it produced
    /// exactly one apposition, and hand that apposition back.
    fn sole_match(text: &str) -> Apposition {
        let mut found = AppositionExtractor::new().extract(text);
        assert_eq!(found.len(), 1);
        found.remove(0)
    }

    #[test]
    fn test_aka_pattern() {
        let hit = sole_match("Peter Parker, also known as Spider-Man, saved the city.");

        assert_eq!(hit.primary, "Peter Parker");
        assert_eq!(hit.alias, "Spider-Man");
        assert_eq!(hit.apposition_type, AppositionType::AlsoKnownAs);
    }

    #[test]
    fn test_born_pattern() {
        let hit = sole_match("Lady Gaga, born Stefani Germanotta, is a famous singer.");

        assert_eq!(hit.primary, "Lady Gaga");
        assert_eq!(hit.alias, "Stefani Germanotta");
        assert_eq!(hit.apposition_type, AppositionType::BirthName);
        // The birth name is the canonical form.
        assert_eq!(hit.canonical(), "Stefani Germanotta");
    }

    #[test]
    fn test_formerly_pattern() {
        let hit = sole_match("Mumbai, formerly Bombay, is India's largest city.");

        assert_eq!(hit.primary, "Mumbai");
        assert_eq!(hit.alias, "Bombay");
        assert_eq!(hit.apposition_type, AppositionType::FormerlyKnownAs);
    }

    #[test]
    fn test_nickname_pattern() {
        let hit = sole_match("Dwayne 'The Rock' Johnson is an actor.");

        assert_eq!(hit.primary, "Dwayne Johnson");
        assert_eq!(hit.alias, "The Rock");
        assert_eq!(hit.apposition_type, AppositionType::Nickname);
    }

    #[test]
    fn test_colon_pattern() {
        let hit = sole_match("AWS: Amazon Web Services provides cloud computing.");

        assert_eq!(hit.alias, "AWS");
        assert_eq!(hit.primary, "Amazon Web Services");
    }

    #[test]
    fn test_nee_pattern() {
        let hit = sole_match("Hillary Clinton, née Rodham, was Secretary of State.");

        assert_eq!(hit.alias, "Rodham");
        assert_eq!(hit.apposition_type, AppositionType::Nee);
    }

    #[test]
    fn test_combined_extraction() {
        let aliases =
            extract_all_aliases("Apple Inc. (AAPL), formerly Apple Computer, launched the iPhone.");

        // Only non-emptiness is checked here, not which sources contributed.
        assert!(!aliases.is_empty());
    }

    #[test]
    fn test_better_known_as() {
        let hit = sole_match("Marshall Mathers, better known as Eminem, is a rapper.");

        assert_eq!(hit.apposition_type, AppositionType::BetterKnownAs);
        // "Eminem" is the canonical (better known) form.
        assert_eq!(hit.canonical(), "Eminem");
    }
}
665        assert_eq!(results[0].apposition_type, AppositionType::BetterKnownAs);
666        // "Eminem" is the canonical (better known) form
667        assert_eq!(results[0].canonical(), "Eminem");
668    }
669}