Skip to main content

anno/linking/
nil.rs

1//! NIL Detection for Entity Linking.
2//!
3//! Identifies mentions that cannot be linked to any KB entry.
4//!
5//! Based on insights from "Contrastive Entity Coreference and Disambiguation for
6//! Historical Texts" (Arora et al. 2024):
7//!
8//! > Historical documents are replete with individuals not remembered in
9//! > contemporary knowledgebases. [...] We use a threshold on the cosine
10//! > similarity to the closest entity in the knowledgebase to identify
11//! > out-of-knowledgebase individuals.
12//!
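//! # NIL Reasons
//!
//! - **No candidates**: Candidate generator found nothing
//! - **Low confidence**: Best candidate score below threshold
//! - **Type mismatch**: NER type incompatible with all candidates
//! - **Noisy mention**: Mention text is too short, purely numeric, or purely punctuation
//! - **Close competition**: Top two candidates score too similarly to pick one
//!   (reported as `NilReason::LargeMargin`)
//! - **Explicit NIL**: Mention manually marked as unlinkable
//! - **Emerging entity**: Entity exists but is not yet in the KB
//! - **Out-of-KB**: Entity unlikely to be in any KB (historical/local figure)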
20//!
21//! # Design
22//!
23//! NIL detection uses multiple signals:
24//! 1. Score distribution analysis (primary)
25//! 2. Margin between top candidates (uncertainty measure)
26//! 3. Out-of-KB confidence threshold (embedding-based)
27//! 4. Coverage heuristics (mention characteristics)
28//! 5. Learned classifier (optional)
29
30use serde::{Deserialize, Serialize};
31
32use super::candidate::Candidate;
33
34/// Reason for NIL classification.
35#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
36pub enum NilReason {
37    /// No candidates were generated
38    NoCandidates,
39    /// Top score is below the confidence threshold
40    LowConfidence,
41    /// All candidates have incompatible types
42    TypeMismatch,
43    /// Mention appears to be noise (too short, numeric, etc.)
44    NoisyMention,
45    /// Large margin between scores suggests uncertainty
46    LargeMargin,
47    /// Explicit NIL (manually marked as unlinkable)
48    ExplicitNil,
49    /// Out-of-knowledgebase entity (embedding similarity too low)
50    ///
51    /// This is especially common in historical documents where many
52    /// individuals are not remembered in contemporary KBs like Wikipedia.
53    OutOfKnowledgebase,
54    /// Emerging/recent entity not yet in KB
55    EmergingEntity,
56}
57
58impl std::fmt::Display for NilReason {
59    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
60        match self {
61            Self::NoCandidates => write!(f, "no_candidates"),
62            Self::LowConfidence => write!(f, "low_confidence"),
63            Self::TypeMismatch => write!(f, "type_mismatch"),
64            Self::NoisyMention => write!(f, "noisy_mention"),
65            Self::LargeMargin => write!(f, "large_margin"),
66            Self::ExplicitNil => write!(f, "explicit_nil"),
67            Self::OutOfKnowledgebase => write!(f, "out_of_kb"),
68            Self::EmergingEntity => write!(f, "emerging_entity"),
69        }
70    }
71}
72
/// Detects mentions that should not be linked to any knowledge-base entry.
///
/// Out-of-KB detection is threshold-based, following Arora et al. (2024):
/// > We use a threshold on the cosine similarity to the closest entity in the
/// > knowledgebase to identify out-of-knowledgebase individuals.
#[derive(Debug, Clone)]
pub struct NilDetector {
    /// Lowest best-candidate score that still counts as a valid link.
    score_threshold: f64,
    /// Maximum margin between the top-2 candidates (for uncertainty).
    ///
    /// NOTE(review): no method visible in this file reads this value; the
    /// close-competition check uses fixed cutoffs instead.
    margin_threshold: f64,
    /// Minimum length of the trimmed mention text, compared against its
    /// length in bytes.
    min_mention_length: usize,
    /// Minimum number of candidates required before linking is attempted.
    min_candidates: usize,
    /// Out-of-KB threshold for embedding similarity.
    ///
    /// When the best candidate's embedding similarity is below this value,
    /// the entity is classified as out-of-knowledgebase. This matters most
    /// for historical documents, where many individuals never made it to
    /// Wikipedia/Wikidata.
    ///
    /// From Arora et al.: typical values are 0.5-0.7 for bi-encoder similarity.
    out_of_kb_threshold: f64,
    /// Whether entity creation is preferred for entity-like NIL mentions.
    ///
    /// When true, the embedding-based analysis suggests creating a new
    /// entry for such mentions instead of flagging them for review. This is
    /// useful for building local entity registries from historical documents.
    prefer_create_over_skip: bool,
}
104
105impl Default for NilDetector {
106    fn default() -> Self {
107        Self {
108            score_threshold: 0.3,
109            margin_threshold: 0.8,
110            min_mention_length: 2,
111            min_candidates: 1,
112            out_of_kb_threshold: 0.5, // Conservative default
113            prefer_create_over_skip: false,
114        }
115    }
116}
117
118impl NilDetector {
119    /// Create a new NIL detector.
120    pub fn new() -> Self {
121        Self::default()
122    }
123
124    /// Set score threshold.
125    pub fn with_score_threshold(mut self, threshold: f64) -> Self {
126        self.score_threshold = threshold;
127        self
128    }
129
130    /// Set margin threshold.
131    pub fn with_margin_threshold(mut self, threshold: f64) -> Self {
132        self.margin_threshold = threshold;
133        self
134    }
135
136    /// Set out-of-KB threshold for embedding similarity.
137    ///
138    /// If the best candidate's embedding similarity is below this threshold,
139    /// the entity is classified as out-of-knowledgebase.
140    ///
141    /// From Arora et al. (2024): typical values are 0.5-0.7 for bi-encoder similarity.
142    pub fn with_out_of_kb_threshold(mut self, threshold: f64) -> Self {
143        self.out_of_kb_threshold = threshold;
144        self
145    }
146
147    /// Set whether to prefer creating new entities over skipping.
148    ///
149    /// When true, out-of-KB entities are flagged for entity creation rather
150    /// than skipping. Useful for building local entity registries.
151    pub fn with_prefer_create(mut self, prefer: bool) -> Self {
152        self.prefer_create_over_skip = prefer;
153        self
154    }
155
156    /// Check if a mention should be classified as NIL.
157    ///
158    /// Returns `Some(NilReason)` if NIL, `None` if linkable.
159    pub fn check_nil(
160        &self,
161        mention: &str,
162        candidates: &[Candidate],
163        ner_type: Option<&str>,
164    ) -> Option<NilReason> {
165        // Check for noisy mention
166        if self.is_noisy_mention(mention) {
167            return Some(NilReason::NoisyMention);
168        }
169
170        // Check for no candidates
171        if candidates.len() < self.min_candidates {
172            return Some(NilReason::NoCandidates);
173        }
174
175        // Check type mismatch
176        if let Some(ner_t) = ner_type {
177            let has_compatible = candidates.iter().any(|c| {
178                c.kb_type
179                    .as_ref()
180                    .map(|kt| super::candidate::type_compatibility(Some(ner_t), Some(kt)) > 0.5)
181                    .unwrap_or(true) // No type info = assume compatible
182            });
183            if !has_compatible {
184                return Some(NilReason::TypeMismatch);
185            }
186        }
187
188        // Get top candidate score
189        let top_score = candidates
190            .iter()
191            .map(|c| c.score)
192            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
193            .unwrap_or(0.0);
194
195        // Check low confidence
196        if top_score < self.score_threshold {
197            return Some(NilReason::LowConfidence);
198        }
199
200        // Check margin (if multiple candidates)
201        if candidates.len() >= 2 {
202            let mut scores: Vec<f64> = candidates.iter().map(|c| c.score).collect();
203            scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
204
205            let margin = scores[0] - scores[1];
206            // If margin is too small (close competition), might be uncertain
207            if margin < 0.1 && top_score < 0.6 {
208                // Only flag if top score isn't very high
209                return Some(NilReason::LargeMargin);
210            }
211        }
212
213        None // Linkable
214    }
215
216    /// Check if a mention is likely noise.
217    fn is_noisy_mention(&self, mention: &str) -> bool {
218        let trimmed = mention.trim();
219
220        // Too short
221        if trimmed.len() < self.min_mention_length {
222            return true;
223        }
224
225        // Pure numeric
226        if trimmed.chars().all(|c| c.is_numeric() || c.is_whitespace()) {
227            return true;
228        }
229
230        // Pure punctuation
231        if trimmed
232            .chars()
233            .all(|c| c.is_ascii_punctuation() || c.is_whitespace())
234        {
235            return true;
236        }
237
238        // Single character (unless CJK)
239        if trimmed.chars().count() == 1 && !trimmed.chars().next().map(is_cjk).unwrap_or(false) {
240            return true;
241        }
242
243        false
244    }
245}
246
/// Check if a character is a CJK ideograph.
///
/// Covers the CJK Unified Ideographs block, its Extensions A through I, and
/// the two CJK Compatibility Ideographs blocks. Kana, Hangul and CJK
/// punctuation are not ideographs and are not matched.
fn is_cjk(c: char) -> bool {
    matches!(
        u32::from(c),
        0x3400..=0x4DBF         // CJK Unified Ideographs Extension A
            | 0x4E00..=0x9FFF   // CJK Unified Ideographs
            | 0xF900..=0xFAFF   // CJK Compatibility Ideographs
            | 0x20000..=0x2A6DF // CJK Unified Ideographs Extension B
            | 0x2A700..=0x2EE5F // CJK Unified Ideographs Extensions C, D, E, F, I
            | 0x2F800..=0x2FA1F // CJK Compatibility Ideographs Supplement
            | 0x30000..=0x323AF // CJK Unified Ideographs Extensions G, H
    )
}
257
258/// Extended candidate with embedding similarity.
259///
260/// Used for out-of-KB detection when embeddings are available.
261#[derive(Debug, Clone)]
262pub struct CandidateWithEmbedding<'a> {
263    /// Reference to the base candidate
264    pub candidate: &'a Candidate,
265    /// Embedding similarity (cosine) between mention and candidate
266    pub embedding_similarity: f64,
267}
268
269impl NilDetector {
270    /// Check for out-of-KB entity using embedding similarity.
271    ///
272    /// This is the core insight from Arora et al. (2024):
273    /// > We use a threshold on the cosine similarity to the closest entity
274    /// > in the knowledgebase to identify out-of-knowledgebase individuals.
275    ///
276    /// Returns `Some(OutOfKnowledgebase)` if the best embedding similarity
277    /// is below the threshold, indicating the entity is likely not in any KB.
278    pub fn check_out_of_kb(&self, candidates: &[CandidateWithEmbedding]) -> Option<NilReason> {
279        if candidates.is_empty() {
280            return None; // Will be caught by NoCandidates check
281        }
282
283        let best_similarity = candidates
284            .iter()
285            .map(|c| c.embedding_similarity)
286            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
287            .unwrap_or(0.0);
288
289        if best_similarity < self.out_of_kb_threshold {
290            Some(NilReason::OutOfKnowledgebase)
291        } else {
292            None
293        }
294    }
295
296    /// Full NIL check with embedding similarity.
297    ///
298    /// This combines standard candidate-based NIL detection with
299    /// embedding-based out-of-KB detection.
300    pub fn check_nil_with_embeddings(
301        &self,
302        mention: &str,
303        candidates: &[CandidateWithEmbedding],
304        ner_type: Option<&str>,
305    ) -> Option<NilReason> {
306        // First, check noisy mention (doesn't need candidates)
307        if self.is_noisy_mention(mention) {
308            return Some(NilReason::NoisyMention);
309        }
310
311        // Check for no candidates
312        if candidates.is_empty() {
313            return Some(NilReason::NoCandidates);
314        }
315
316        // Check out-of-KB using embedding threshold
317        // This is the key insight from Arora et al.
318        if let Some(reason) = self.check_out_of_kb(candidates) {
319            return Some(reason);
320        }
321
322        // Extract base candidates for remaining checks
323        let base_candidates: Vec<&Candidate> = candidates.iter().map(|c| c.candidate).collect();
324
325        // Check type mismatch
326        if let Some(ner_t) = ner_type {
327            let has_compatible = base_candidates.iter().any(|c| {
328                c.kb_type
329                    .as_ref()
330                    .map(|kt| super::candidate::type_compatibility(Some(ner_t), Some(kt)) > 0.5)
331                    .unwrap_or(true)
332            });
333            if !has_compatible {
334                return Some(NilReason::TypeMismatch);
335            }
336        }
337
338        // Get top candidate score
339        let top_score = base_candidates
340            .iter()
341            .map(|c| c.score)
342            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
343            .unwrap_or(0.0);
344
345        // Check low confidence
346        if top_score < self.score_threshold {
347            return Some(NilReason::LowConfidence);
348        }
349
350        // Check margin
351        if base_candidates.len() >= 2 {
352            let mut scores: Vec<f64> = base_candidates.iter().map(|c| c.score).collect();
353            scores.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
354
355            let margin = scores[0] - scores[1];
356            if margin < 0.1 && top_score < 0.6 {
357                return Some(NilReason::LargeMargin);
358            }
359        }
360
361        None
362    }
363
364    /// Analyze with embedding-based out-of-KB detection.
365    ///
366    /// Returns full analysis including suggested action.
367    pub fn analyze_with_embeddings(
368        &self,
369        mention: &str,
370        candidates: &[CandidateWithEmbedding],
371        ner_type: Option<&str>,
372    ) -> NilAnalysis {
373        let nil_result = self.check_nil_with_embeddings(mention, candidates, ner_type);
374
375        match nil_result {
376            None => {
377                let best_sim = candidates
378                    .iter()
379                    .map(|c| c.embedding_similarity)
380                    .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
381                    .unwrap_or(0.0);
382
383                NilAnalysis {
384                    is_nil: false,
385                    reason: None,
386                    confidence: best_sim,
387                    action: NilAction::Link,
388                }
389            }
390            Some(reason) => {
391                let (confidence, action) = match &reason {
392                    NilReason::NoCandidates => {
393                        if is_likely_entity_name(mention) {
394                            (
395                                0.7,
396                                if self.prefer_create_over_skip {
397                                    NilAction::CreateEntry
398                                } else {
399                                    NilAction::Review
400                                },
401                            )
402                        } else {
403                            (0.9, NilAction::Skip)
404                        }
405                    }
406                    NilReason::OutOfKnowledgebase => {
407                        // Key case: entity exists but not in KB (common in historical docs)
408                        // High confidence this is a real entity, just not in Wikipedia
409                        if is_likely_entity_name(mention) {
410                            (
411                                0.8,
412                                if self.prefer_create_over_skip {
413                                    NilAction::CreateEntry
414                                } else {
415                                    NilAction::Review
416                                },
417                            )
418                        } else {
419                            (0.6, NilAction::Review)
420                        }
421                    }
422                    NilReason::EmergingEntity => (0.7, NilAction::CreateEntry),
423                    NilReason::LowConfidence => (0.6, NilAction::Review),
424                    NilReason::TypeMismatch => (0.8, NilAction::Review),
425                    NilReason::NoisyMention => (0.95, NilAction::Skip),
426                    NilReason::LargeMargin => (0.5, NilAction::Review),
427                    NilReason::ExplicitNil => (1.0, NilAction::Skip),
428                };
429
430                NilAnalysis {
431                    is_nil: true,
432                    reason: Some(reason),
433                    confidence,
434                    action,
435                }
436            }
437        }
438    }
439}
440
441/// Result of NIL analysis including calibrated score.
442#[derive(Debug, Clone, Serialize, Deserialize)]
443pub struct NilAnalysis {
444    /// Whether this is NIL
445    pub is_nil: bool,
446    /// Reason if NIL
447    pub reason: Option<NilReason>,
448    /// Confidence in the NIL decision (0-1)
449    pub confidence: f64,
450    /// Suggested action
451    pub action: NilAction,
452}
453
454/// Suggested action for NIL mentions.
455#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
456pub enum NilAction {
457    /// Link to best candidate (not NIL)
458    Link,
459    /// Skip this mention
460    Skip,
461    /// Flag for human review
462    Review,
463    /// Candidate for new KB entry
464    CreateEntry,
465}
466
467impl NilDetector {
468    /// Full NIL analysis with suggested action.
469    pub fn analyze(
470        &self,
471        mention: &str,
472        candidates: &[Candidate],
473        ner_type: Option<&str>,
474    ) -> NilAnalysis {
475        let nil_result = self.check_nil(mention, candidates, ner_type);
476
477        match nil_result {
478            None => {
479                // Linkable
480                let top_score = candidates
481                    .iter()
482                    .map(|c| c.score)
483                    .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
484                    .unwrap_or(0.0);
485
486                NilAnalysis {
487                    is_nil: false,
488                    reason: None,
489                    confidence: top_score,
490                    action: NilAction::Link,
491                }
492            }
493            Some(reason) => {
494                let (confidence, action) = match &reason {
495                    NilReason::NoCandidates => {
496                        // High confidence it's NIL, but might be new entity
497                        if is_likely_entity_name(mention) {
498                            (0.7, NilAction::CreateEntry)
499                        } else {
500                            (0.9, NilAction::Skip)
501                        }
502                    }
503                    NilReason::LowConfidence => (0.6, NilAction::Review),
504                    NilReason::TypeMismatch => (0.8, NilAction::Review),
505                    NilReason::NoisyMention => (0.95, NilAction::Skip),
506                    NilReason::LargeMargin => (0.5, NilAction::Review),
507                    NilReason::ExplicitNil => (1.0, NilAction::Skip),
508                    NilReason::OutOfKnowledgebase => (0.85, NilAction::CreateEntry),
509                    NilReason::EmergingEntity => (0.75, NilAction::CreateEntry),
510                };
511
512                NilAnalysis {
513                    is_nil: true,
514                    reason: Some(reason),
515                    confidence,
516                    action,
517                }
518            }
519        }
520    }
521}
522
/// Heuristic check if a mention looks like a proper entity name.
///
/// Returns `true` when the first character of the trimmed mention is
/// uppercase. Empty and whitespace-only mentions return `false`, as do
/// mentions in scripts without letter case.
fn is_likely_entity_name(mention: &str) -> bool {
    // An uppercase first character already means the first word is
    // capitalised, so no separate per-word scan is needed.
    mention
        .trim()
        .chars()
        .next()
        .map_or(false, char::is_uppercase)
}
542
#[cfg(test)]
mod tests {
    use super::super::candidate::{Candidate, CandidateSource};
    use super::*;

    /// Build a Wikidata candidate with the given id, label and score.
    fn wikidata_candidate(id: &str, label: &str, score: f64) -> Candidate {
        let mut candidate = Candidate::new(id, CandidateSource::Wikidata, label);
        candidate.score = score;
        candidate
    }

    #[test]
    fn test_nil_no_candidates() {
        let detector = NilDetector::new();
        let result = detector.check_nil("Unknown Entity", &[], None);
        assert_eq!(result, Some(NilReason::NoCandidates));
    }

    #[test]
    fn test_nil_noisy_mention() {
        let detector = NilDetector::new();
        assert_eq!(
            detector.check_nil("123", &[], None),
            Some(NilReason::NoisyMention)
        );
        assert_eq!(
            detector.check_nil(".", &[], None),
            Some(NilReason::NoisyMention)
        );
    }

    #[test]
    fn test_single_character_mentions() {
        let detector = NilDetector::new();
        // A single Latin letter is noise.
        assert_eq!(
            detector.check_nil("A", &[], None),
            Some(NilReason::NoisyMention)
        );
        // A single CJK ideograph is not noise; it only fails for lack of candidates.
        assert_eq!(
            detector.check_nil("中", &[], None),
            Some(NilReason::NoCandidates)
        );
    }

    #[test]
    fn test_linkable() {
        let detector = NilDetector::new();
        let candidate = wikidata_candidate("Q937", "Albert Einstein", 0.8);

        let result = detector.check_nil("Einstein", &[candidate], Some("PERSON"));
        assert_eq!(result, None); // Linkable
    }

    #[test]
    fn test_nil_low_confidence() {
        let detector = NilDetector::new();
        let candidate = wikidata_candidate("Q937", "Albert Einstein", 0.1);

        let result = detector.check_nil("Einstein", &[candidate], None);
        assert_eq!(result, Some(NilReason::LowConfidence));
    }

    #[test]
    fn test_nil_close_competition() {
        let detector = NilDetector::new();
        // Two near-tied candidates, neither scoring high enough to trust.
        let candidates = vec![
            wikidata_candidate("Q1", "John Smith (politician)", 0.5),
            wikidata_candidate("Q2", "John Smith (explorer)", 0.45),
        ];

        let result = detector.check_nil("John Smith", &candidates, None);
        assert_eq!(result, Some(NilReason::LargeMargin));
    }

    #[test]
    fn test_nil_analysis() {
        let detector = NilDetector::new();
        let analysis = detector.analyze("Unknown Entity", &[], None);

        assert!(analysis.is_nil);
        assert!(matches!(analysis.reason, Some(NilReason::NoCandidates)));
    }

    #[test]
    fn test_is_cjk() {
        assert!(is_cjk('中'));
        assert!(is_cjk('日'));
        assert!(!is_cjk('A'));
    }

    #[test]
    fn test_out_of_kb_detection() {
        let detector = NilDetector::new().with_out_of_kb_threshold(0.5);

        // Candidate with low embedding similarity (historical figure)
        let candidate = wikidata_candidate("Q12345", "John Smith", 0.6);

        let candidates_with_embeddings = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.3, // Below threshold
        }];

        let result = detector.check_out_of_kb(&candidates_with_embeddings);
        assert_eq!(result, Some(NilReason::OutOfKnowledgebase));
    }

    #[test]
    fn test_out_of_kb_above_threshold() {
        let detector = NilDetector::new().with_out_of_kb_threshold(0.5);

        let candidate = wikidata_candidate("Q937", "Albert Einstein", 0.9);

        let candidates_with_embeddings = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.85, // Above threshold
        }];

        let result = detector.check_out_of_kb(&candidates_with_embeddings);
        assert_eq!(result, None); // Not out-of-KB
    }

    #[test]
    fn test_prefer_create_over_skip() {
        let detector = NilDetector::new()
            .with_out_of_kb_threshold(0.5)
            .with_prefer_create(true);

        // Historical figure not in KB
        let candidate = wikidata_candidate("Q99999", "Unknown Person", 0.4);

        let candidates = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.3,
        }];

        let analysis = detector.analyze_with_embeddings(
            "Mayor Thomas Jenkins", // Looks like entity name
            &candidates,
            Some("PERSON"),
        );

        assert!(analysis.is_nil);
        assert_eq!(analysis.reason, Some(NilReason::OutOfKnowledgebase));
        assert_eq!(analysis.action, NilAction::CreateEntry);
    }

    #[test]
    fn test_out_of_kb_review_by_default() {
        // Same scenario as above, but without the create preference.
        let detector = NilDetector::new().with_out_of_kb_threshold(0.5);

        let candidate = wikidata_candidate("Q99999", "Unknown Person", 0.4);

        let candidates = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.3,
        }];

        let analysis = detector.analyze_with_embeddings("Mayor Thomas Jenkins", &candidates, None);

        assert!(analysis.is_nil);
        assert_eq!(analysis.reason, Some(NilReason::OutOfKnowledgebase));
        assert_eq!(analysis.action, NilAction::Review);
    }

    #[test]
    fn test_analyze_with_embeddings_linkable() {
        let detector = NilDetector::new().with_out_of_kb_threshold(0.5);

        let candidate = wikidata_candidate("Q937", "Albert Einstein", 0.9);

        let candidates = vec![CandidateWithEmbedding {
            candidate: &candidate,
            embedding_similarity: 0.85,
        }];

        let analysis = detector.analyze_with_embeddings("Einstein", &candidates, None);

        assert!(!analysis.is_nil);
        assert_eq!(analysis.reason, None);
        assert_eq!(analysis.action, NilAction::Link);
        assert!((analysis.confidence - 0.85).abs() < 1e-12);
    }

    #[test]
    fn test_nil_reason_display() {
        assert_eq!(NilReason::OutOfKnowledgebase.to_string(), "out_of_kb");
        assert_eq!(NilReason::EmergingEntity.to_string(), "emerging_entity");
    }
}