Skip to main content

anno/linking/
confusables.rs

1//! Hard negative (confusable) entity mining for disambiguation.
2//!
3//! Based on the approach from "Contrastive Entity Coreference and Disambiguation for
4//! Historical Texts" (Arora et al. 2024), which introduces WikiConfusables:
5//!
6//! > We source hard negative examples from Wikipedia disambiguation pages, which
7//! > list entities that share the same name but refer to different real-world
8//! > referents (e.g., "John Smith" could be a footballer, politician, or musician).
9//!
10//! # Hard Negative Sources
11//!
12//! 1. **Wikipedia Disambiguation Pages**: "John Smith" → multiple Q-IDs
13//! 2. **Name Collisions**: Same surface form, different entities
14//! 3. **Transliteration Variants**: "Beijing"/"Peking", "Moscow"/"Moskva"
15//! 4. **OCR Confusables**: "Gnome"/"Gnorne" (historical documents)
16//! 5. **Temporal Confusables**: "President Bush" (41 vs 43)
17//!
18//! # Usage
19//!
20//! ```rust
21//! use anno::linking::confusables::{ConfusableSet, ConfusableReason};
22//!
23//! let mut confusables = ConfusableSet::new("John Smith");
24//!
25//! // Add from disambiguation page
26//! confusables.add("Q123456", ConfusableReason::DisambiguationPage,
27//!     Some("American politician"));
28//! confusables.add("Q789012", ConfusableReason::DisambiguationPage,
29//!     Some("English footballer"));
30//!
31//! // Use for contrastive training
32//! let pairs = confusables.to_training_pairs("Q123456");
33//! ```
34
35use serde::{Deserialize, Serialize};
36use std::collections::HashMap;
37
38/// Reason why two entities are confusable.
39#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
40pub enum ConfusableReason {
41    /// From Wikipedia/Wikidata disambiguation page
42    DisambiguationPage,
43    /// Share the same surface form/name
44    NameCollision,
45    /// Transliteration or spelling variant
46    TransliterationVariant,
47    /// OCR error pattern (common in historical docs)
48    OcrError,
49    /// Same title/role at different times
50    TemporalAmbiguity,
51    /// Parent/child or family with same name
52    FamilialAmbiguity,
53    /// Organization name changed over time
54    NameChange,
55    /// Fictional vs real entity with same name
56    FictionalVsReal,
57}
58
59/// A confusable entity candidate (hard negative).
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct ConfusableEntity {
62    /// Knowledge base ID
63    pub kb_id: String,
64    /// Canonical label
65    pub label: String,
66    /// Short description to distinguish
67    pub description: Option<String>,
68    /// Why this is confusable
69    pub reason: ConfusableReason,
70    /// Type information (human, organization, etc.)
71    pub entity_type: Option<String>,
72    /// Difficulty score (0-1, higher = harder to distinguish)
73    pub difficulty: f64,
74}
75
76impl ConfusableEntity {
77    /// Create a new confusable entity.
78    pub fn new(kb_id: &str, reason: ConfusableReason) -> Self {
79        Self {
80            kb_id: kb_id.to_string(),
81            label: String::new(),
82            description: None,
83            reason,
84            entity_type: None,
85            difficulty: 0.5,
86        }
87    }
88
89    /// Set label.
90    pub fn with_label(mut self, label: &str) -> Self {
91        self.label = label.to_string();
92        self
93    }
94
95    /// Set description.
96    pub fn with_description(mut self, desc: &str) -> Self {
97        self.description = Some(desc.to_string());
98        self
99    }
100
101    /// Set entity type.
102    pub fn with_entity_type(mut self, entity_type: &str) -> Self {
103        self.entity_type = Some(entity_type.to_string());
104        self
105    }
106
107    /// Set difficulty.
108    pub fn with_difficulty(mut self, difficulty: f64) -> Self {
109        self.difficulty = difficulty.clamp(0.0, 1.0);
110        self
111    }
112}
113
114/// A set of confusable entities for a given surface form.
115#[derive(Debug, Clone, Default, Serialize, Deserialize)]
116pub struct ConfusableSet {
117    /// The surface form (e.g., "John Smith")
118    pub surface_form: String,
119    /// All confusable entities
120    pub entities: Vec<ConfusableEntity>,
121}
122
123impl ConfusableSet {
124    /// Create a new confusable set.
125    pub fn new(surface_form: &str) -> Self {
126        Self {
127            surface_form: surface_form.to_string(),
128            entities: Vec::new(),
129        }
130    }
131
132    /// Add a confusable entity.
133    pub fn add(&mut self, kb_id: &str, reason: ConfusableReason, description: Option<&str>) {
134        let mut entity = ConfusableEntity::new(kb_id, reason);
135        if let Some(desc) = description {
136            entity = entity.with_description(desc);
137        }
138        self.entities.push(entity);
139    }
140
141    /// Add a full confusable entity.
142    pub fn add_entity(&mut self, entity: ConfusableEntity) {
143        self.entities.push(entity);
144    }
145
146    /// Generate training pairs for contrastive learning.
147    ///
148    /// Given a positive entity ID, returns (positive, negative) pairs
149    /// for all other confusable entities.
150    pub fn to_training_pairs(&self, positive_kb_id: &str) -> Vec<TrainingPair> {
151        self.entities
152            .iter()
153            .filter(|e| e.kb_id != positive_kb_id)
154            .map(|negative| TrainingPair {
155                surface_form: self.surface_form.clone(),
156                positive_kb_id: positive_kb_id.to_string(),
157                negative_kb_id: negative.kb_id.clone(),
158                negative_description: negative.description.clone(),
159                difficulty: negative.difficulty,
160            })
161            .collect()
162    }
163
164    /// Number of confusable entities.
165    pub fn len(&self) -> usize {
166        self.entities.len()
167    }
168
169    /// Check if empty.
170    pub fn is_empty(&self) -> bool {
171        self.entities.is_empty()
172    }
173
174    /// Get entities by reason.
175    pub fn filter_by_reason(&self, reason: &ConfusableReason) -> Vec<&ConfusableEntity> {
176        self.entities
177            .iter()
178            .filter(|e| &e.reason == reason)
179            .collect()
180    }
181}
182
183/// A training pair for contrastive learning.
184#[derive(Debug, Clone, Serialize, Deserialize)]
185pub struct TrainingPair {
186    /// Surface form (anchor)
187    pub surface_form: String,
188    /// Positive entity KB ID
189    pub positive_kb_id: String,
190    /// Negative entity KB ID (hard negative)
191    pub negative_kb_id: String,
192    /// Description of negative (for context)
193    pub negative_description: Option<String>,
194    /// Difficulty score
195    pub difficulty: f64,
196}
197
198/// Registry of confusable entities.
199///
200/// Maps surface forms to their confusable entity sets.
201#[derive(Debug, Clone, Default, Serialize, Deserialize)]
202pub struct ConfusableRegistry {
203    /// Map from normalized surface form to confusable set
204    entries: HashMap<String, ConfusableSet>,
205}
206
207impl ConfusableRegistry {
208    /// Create a new registry.
209    pub fn new() -> Self {
210        Self::default()
211    }
212
213    /// Add or merge a confusable set.
214    pub fn add(&mut self, set: ConfusableSet) {
215        let key = set.surface_form.to_lowercase();
216        self.entries
217            .entry(key)
218            .and_modify(|existing| {
219                for entity in &set.entities {
220                    if !existing.entities.iter().any(|e| e.kb_id == entity.kb_id) {
221                        existing.entities.push(entity.clone());
222                    }
223                }
224            })
225            .or_insert(set);
226    }
227
228    /// Look up confusables for a surface form.
229    pub fn get(&self, surface_form: &str) -> Option<&ConfusableSet> {
230        self.entries.get(&surface_form.to_lowercase())
231    }
232
233    /// Number of surface forms tracked.
234    pub fn len(&self) -> usize {
235        self.entries.len()
236    }
237
238    /// Check if empty.
239    pub fn is_empty(&self) -> bool {
240        self.entries.is_empty()
241    }
242
243    /// Load well-known confusables for testing/demo.
244    ///
245    /// These are famous disambiguation cases that frequently appear
246    /// in historical and modern documents.
247    pub fn with_well_known(mut self) -> Self {
248        // "John Smith" - extremely common name
249        let mut john_smith = ConfusableSet::new("John Smith");
250        john_smith.add_entity(
251            ConfusableEntity::new("Q217557", ConfusableReason::DisambiguationPage)
252                .with_label("John Smith")
253                .with_description("English soldier and explorer, founder of Jamestown")
254                .with_entity_type("human")
255                .with_difficulty(0.8),
256        );
257        john_smith.add_entity(
258            ConfusableEntity::new("Q556859", ConfusableReason::DisambiguationPage)
259                .with_label("John Smith")
260                .with_description("British Labour Party leader")
261                .with_entity_type("human")
262                .with_difficulty(0.7),
263        );
264        john_smith.add_entity(
265            ConfusableEntity::new("Q23489016", ConfusableReason::FictionalVsReal)
266                .with_label("John Smith")
267                .with_description("Doctor Who character")
268                .with_entity_type("fictional character")
269                .with_difficulty(0.6),
270        );
271        self.add(john_smith);
272
273        // "George Bush" - temporal ambiguity
274        let mut george_bush = ConfusableSet::new("George Bush");
275        george_bush.add_entity(
276            ConfusableEntity::new("Q207", ConfusableReason::TemporalAmbiguity)
277                .with_label("George W. Bush")
278                .with_description("43rd President of the United States")
279                .with_entity_type("human")
280                .with_difficulty(0.9),
281        );
282        george_bush.add_entity(
283            ConfusableEntity::new("Q23505", ConfusableReason::TemporalAmbiguity)
284                .with_label("George H. W. Bush")
285                .with_description("41st President of the United States")
286                .with_entity_type("human")
287                .with_difficulty(0.9),
288        );
289        self.add(george_bush);
290
291        // "President Bush" - same issue
292        let mut president_bush = ConfusableSet::new("President Bush");
293        president_bush.add_entity(
294            ConfusableEntity::new("Q207", ConfusableReason::TemporalAmbiguity)
295                .with_label("George W. Bush")
296                .with_description("43rd President of the United States")
297                .with_entity_type("human")
298                .with_difficulty(0.95),
299        );
300        president_bush.add_entity(
301            ConfusableEntity::new("Q23505", ConfusableReason::TemporalAmbiguity)
302                .with_label("George H. W. Bush")
303                .with_description("41st President of the United States")
304                .with_entity_type("human")
305                .with_difficulty(0.95),
306        );
307        self.add(president_bush);
308
309        // "Michael Jackson" - different famous people
310        let mut michael_jackson = ConfusableSet::new("Michael Jackson");
311        michael_jackson.add_entity(
312            ConfusableEntity::new("Q2831", ConfusableReason::DisambiguationPage)
313                .with_label("Michael Jackson")
314                .with_description("American singer-songwriter (1958-2009)")
315                .with_entity_type("human")
316                .with_difficulty(0.7),
317        );
318        michael_jackson.add_entity(
319            ConfusableEntity::new("Q318029", ConfusableReason::DisambiguationPage)
320                .with_label("Michael Jackson")
321                .with_description("English beer and whisky writer")
322                .with_entity_type("human")
323                .with_difficulty(0.6),
324        );
325        self.add(michael_jackson);
326
327        // "Apple" - company vs fruit vs records
328        let mut apple = ConfusableSet::new("Apple");
329        apple.add_entity(
330            ConfusableEntity::new("Q312", ConfusableReason::NameCollision)
331                .with_label("Apple Inc.")
332                .with_description("American technology company")
333                .with_entity_type("organization")
334                .with_difficulty(0.7),
335        );
336        apple.add_entity(
337            ConfusableEntity::new("Q213710", ConfusableReason::NameCollision)
338                .with_label("Apple Records")
339                .with_description("Record label founded by the Beatles")
340                .with_entity_type("organization")
341                .with_difficulty(0.6),
342        );
343        self.add(apple);
344
345        // "Paris" - city vs person vs mythological figure
346        let mut paris = ConfusableSet::new("Paris");
347        paris.add_entity(
348            ConfusableEntity::new("Q90", ConfusableReason::DisambiguationPage)
349                .with_label("Paris")
350                .with_description("Capital city of France")
351                .with_entity_type("city")
352                .with_difficulty(0.5),
353        );
354        paris.add_entity(
355            ConfusableEntity::new("Q167646", ConfusableReason::DisambiguationPage)
356                .with_label("Paris Hilton")
357                .with_description("American socialite and businesswoman")
358                .with_entity_type("human")
359                .with_difficulty(0.4),
360        );
361        paris.add_entity(
362            ConfusableEntity::new("Q167491", ConfusableReason::FictionalVsReal)
363                .with_label("Paris")
364                .with_description("Trojan prince in Greek mythology")
365                .with_entity_type("mythological figure")
366                .with_difficulty(0.6),
367        );
368        self.add(paris);
369
370        // Transliteration variants
371        let mut beijing = ConfusableSet::new("Beijing");
372        beijing.add_entity(
373            ConfusableEntity::new("Q956", ConfusableReason::TransliterationVariant)
374                .with_label("Beijing")
375                .with_description("Capital of China (modern transliteration)")
376                .with_entity_type("city")
377                .with_difficulty(0.3),
378        );
379        self.add(beijing);
380
381        let mut peking = ConfusableSet::new("Peking");
382        peking.add_entity(
383            ConfusableEntity::new("Q956", ConfusableReason::TransliterationVariant)
384                .with_label("Beijing")
385                .with_description("Capital of China (historical transliteration)")
386                .with_entity_type("city")
387                .with_difficulty(0.3),
388        );
389        self.add(peking);
390
391        // Historical OCR-style confusables (common in digitized newspapers)
392        let mut washington = ConfusableSet::new("Washington");
393        washington.add_entity(
394            ConfusableEntity::new("Q23", ConfusableReason::DisambiguationPage)
395                .with_label("George Washington")
396                .with_description("1st President of the United States")
397                .with_entity_type("human")
398                .with_difficulty(0.7),
399        );
400        washington.add_entity(
401            ConfusableEntity::new("Q61", ConfusableReason::DisambiguationPage)
402                .with_label("Washington, D.C.")
403                .with_description("Capital of the United States")
404                .with_entity_type("city")
405                .with_difficulty(0.6),
406        );
407        washington.add_entity(
408            ConfusableEntity::new("Q1223", ConfusableReason::DisambiguationPage)
409                .with_label("Washington")
410                .with_description("State in the Pacific Northwest")
411                .with_entity_type("administrative region")
412                .with_difficulty(0.5),
413        );
414        self.add(washington);
415
416        self
417    }
418
419    /// Generate all training pairs for contrastive learning.
420    ///
421    /// For each confusable set, generates pairs where one entity is positive
422    /// and all others are negatives.
423    pub fn generate_all_training_pairs(&self) -> Vec<TrainingPair> {
424        let mut pairs = Vec::new();
425        for set in self.entries.values() {
426            for positive in &set.entities {
427                pairs.extend(set.to_training_pairs(&positive.kb_id));
428            }
429        }
430        pairs
431    }
432
433    /// Filter to entities of a specific type.
434    pub fn filter_by_type(&self, entity_type: &str) -> Vec<&ConfusableEntity> {
435        self.entries
436            .values()
437            .flat_map(|set| &set.entities)
438            .filter(|e| {
439                e.entity_type
440                    .as_ref()
441                    .is_some_and(|t| t.to_lowercase() == entity_type.to_lowercase())
442            })
443            .collect()
444    }
445}
446
/// OCR error patterns common in historical documents.
///
/// Based on the observation from Arora et al. that historical documents
/// "are replete with individuals not remembered in contemporary knowledgebases"
/// and suffer from OCR noise.
///
/// Two kinds of confusion are modelled:
///
/// * single-character look-alikes, stored per instance in `substitutions`;
/// * glyph-sequence confusions such as "rn" ↔ "m", listed in
///   `SEQUENCE_PATTERNS` and applied by every instance.
#[derive(Debug, Clone, Default)]
pub struct OcrConfusables {
    /// Character substitution patterns
    substitutions: HashMap<char, Vec<char>>,
}

impl OcrConfusables {
    /// Confusions that change the number of glyphs, as `(from, to)` pairs.
    ///
    /// Matching is case-sensitive: only the lowercase shapes listed here are
    /// treated as look-alikes.
    const SEQUENCE_PATTERNS: [(&'static str, &'static str); 4] = [
        ("rn", "m"),
        ("m", "rn"),
        ("w", "vv"),
        ("w", "uu"),
    ];

    /// Create with common OCR error patterns.
    pub fn new() -> Self {
        // Single-glyph look-alikes only. Multi-glyph confusions ("m" → "rn",
        // "w" → "vv") cannot be expressed as one `char` and live in
        // `SEQUENCE_PATTERNS` instead.
        let substitutions = HashMap::from([
            ('m', vec!['r', 'n']),
            ('l', vec!['1', 'I', '|']),
            ('O', vec!['0', 'Q']),
            ('I', vec!['l', '1', '|']),
            ('S', vec!['5', '$']),
            ('B', vec!['8', '3']),
            ('G', vec!['6', 'C']),
            ('Z', vec!['2']),
            ('o', vec!['0', 'c']),
            ('c', vec!['o', 'e']),
            ('e', vec!['c', 'o']),
            ('h', vec!['b', 'n']),
            ('u', vec!['v', 'n']),
            ('v', vec!['u', 'w']),
            // Each target listed once so no variant is emitted twice.
            ('w', vec!['v', 'u']),
        ]);

        Self { substitutions }
    }

    /// Generate OCR variants of a string.
    ///
    /// Returns possible OCR-corrupted versions of the input: first every
    /// single-character substitution (one position at a time, in text order),
    /// then one variant per applicable sequence pattern with all occurrences
    /// of that pattern replaced. The unchanged input is never included.
    pub fn generate_variants(&self, text: &str) -> Vec<String> {
        let mut variants = Vec::new();
        let chars: Vec<char> = text.chars().collect();

        // Single character substitutions
        for (i, c) in chars.iter().enumerate() {
            if let Some(subs) = self.substitutions.get(c) {
                for sub in subs {
                    let mut variant: String = chars[..i].iter().collect();
                    variant.push(*sub);
                    variant.extend(&chars[i + 1..]);
                    variants.push(variant);
                }
            }
        }

        // Sequence patterns. The presence check uses the same case-sensitive
        // match as the replacement, so a pattern that does not occur can
        // never push an unmodified copy of `text`.
        for &(from, to) in Self::SEQUENCE_PATTERNS.iter() {
            if text.contains(from) {
                variants.push(text.replace(from, to));
            }
        }

        variants
    }

    /// Check if two strings might be OCR variants of each other.
    ///
    /// True when some generated variant of `a` equals `b`, or some generated
    /// variant of `b` equals `a`, ignoring ASCII case. Two strings that have
    /// no generated variant matching the other (including identical strings)
    /// yield `false`.
    pub fn might_be_variants(&self, a: &str, b: &str) -> bool {
        let reaches = |source: &str, target: &str| {
            self.generate_variants(source)
                .iter()
                .any(|v| v.eq_ignore_ascii_case(target))
        };
        reaches(a, b) || reaches(b, a)
    }
}
526
#[cfg(test)]
mod tests {
    use super::*;

    /// Registry preloaded with the built-in demo entries.
    fn demo_registry() -> ConfusableRegistry {
        ConfusableRegistry::new().with_well_known()
    }

    /// True when `set` contains an entity with the given knowledge-base ID.
    fn has_kb_id(set: &ConfusableSet, kb_id: &str) -> bool {
        set.entities.iter().any(|e| e.kb_id == kb_id)
    }

    #[test]
    fn test_confusable_set() {
        let mut set = ConfusableSet::new("John Smith");
        let candidates = [("Q1", "Explorer"), ("Q2", "Politician")];
        for (kb_id, description) in candidates.iter() {
            set.add(
                kb_id,
                ConfusableReason::DisambiguationPage,
                Some(description),
            );
        }

        assert_eq!(set.len(), 2);

        // With Q1 as the positive, only Q2 is left as a negative.
        let pairs = set.to_training_pairs("Q1");
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].negative_kb_id, "Q2");
    }

    #[test]
    fn test_registry() {
        let registry = demo_registry();

        // Lookup is case-insensitive: the set was registered as "George Bush".
        let bush = registry.get("george bush");
        assert!(bush.is_some());
        assert!(bush.unwrap().len() >= 2);

        // The demo data must yield at least one contrastive pair.
        let pairs = registry.generate_all_training_pairs();
        assert!(!pairs.is_empty());
    }

    #[test]
    fn test_temporal_ambiguity() {
        let registry = demo_registry();

        let bush = registry.get("president bush").unwrap();
        let temporal = bush.filter_by_reason(&ConfusableReason::TemporalAmbiguity);
        assert!(temporal.len() >= 2);
    }

    #[test]
    fn test_ocr_confusables() {
        let ocr = OcrConfusables::new();

        // "rn" ↔ "m" is a classic OCR confusion
        let expected = "Gnorne".to_string();
        assert!(ocr.generate_variants("Gnome").contains(&expected));

        assert!(ocr.might_be_variants("Gnome", "Gnorne"));
    }

    #[test]
    fn test_transliteration() {
        let registry = demo_registry();

        // Beijing and Peking should both map to Q956
        let beijing = registry.get("beijing");
        let peking = registry.get("peking");

        assert!(beijing.is_some());
        assert!(peking.is_some());

        assert!(has_kb_id(beijing.unwrap(), "Q956"));
        assert!(has_kb_id(peking.unwrap(), "Q956"));
    }
}