Skip to main content

anno/linking/
candidate.rs

1//! Candidate generation for entity linking.
2//!
3//! High-recall retrieval of potential KB entries for a mention.
4//!
5//! # Similarity Metrics
6//!
7//! The candidate generator supports multiple string similarity metrics
8//! for fuzzy matching. The default is Jaccard (word + trigram), but
9//! edit distance variants are useful for:
10//!
11//! - **Typo correction**: Damerau-Levenshtein handles transpositions
12//! - **OCR/damaged text**: Wildcard edit distance matches partial patterns
13//! - **Cross-script matching**: Edit distance works character-by-character
14//!
15//! # Research Context
16//!
17//! Edit distance with wildcards is particularly important for ancient/historical
18//! text processing where inscriptions may be damaged or illegible.
19//! See Tamburini (2025) on decipherment of ancient scripts.
20
21use crate::edit_distance;
22use serde::{Deserialize, Serialize};
23use std::collections::HashMap;
24
/// Source of a candidate.
///
/// Identifies which knowledge base a [`Candidate`] came from. The variant
/// decides the URI scheme produced by `Candidate::to_iri` and the prefix
/// produced by `Candidate::to_curie`. `Wikidata` is the `Default` variant.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum CandidateSource {
    /// Wikidata - Most comprehensive, actively maintained
    #[default]
    Wikidata,
    /// YAGO - Wikipedia + WordNet + GeoNames ontology
    YAGO,
    /// DBpedia - Wikipedia infobox extraction
    DBpedia,
    /// Wikipedia - Direct article links
    Wikipedia,
    /// Freebase - Legacy (deprecated 2016, mapped to Wikidata)
    Freebase,
    /// UMLS - Unified Medical Language System (biomedical)
    UMLS,
    /// GeoNames - Geographic entities
    GeoNames,
    /// Custom knowledge base
    ///
    /// The wrapped string is used verbatim as the prefix in both
    /// `Candidate::to_iri` and `Candidate::to_curie`.
    Custom(String),
}
46
47// =============================================================================
48// Similarity Metrics
49// =============================================================================
50
/// Similarity metric for candidate matching.
///
/// Different metrics are suited to different use cases:
///
/// | Metric | Best For | Speed |
/// |--------|----------|-------|
/// | Jaccard | General text, multi-word entities | Fast |
/// | EditDistance | Typo correction, single words | Medium |
/// | DamerauLevenshtein | Keyboard typos (transpositions) | Medium |
/// | EditDistanceWildcard | OCR/damaged text | Slower |
///
/// The enum is `Copy`, and `Jaccard` is the `Default` variant. Scores are
/// obtained through `SimilarityMetric::compute`.
///
/// # Research Context
///
/// The wildcard edit distance (Tamburini 2025) is particularly useful for
/// computational philology and ancient language processing where texts
/// may have illegible portions marked with `?` (single char) or `*` (multi char).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum SimilarityMetric {
    /// Jaccard similarity on words + character trigrams (default).
    ///
    /// Best for general text with multi-word entity names.
    /// Fast but doesn't handle character-level edits well.
    #[default]
    Jaccard,

    /// Normalized Levenshtein edit distance.
    ///
    /// Counts insertions, deletions, substitutions.
    /// Better for single-word matching and typo detection.
    EditDistance,

    /// Damerau-Levenshtein distance.
    ///
    /// Like edit distance but counts adjacent transpositions
    /// (e.g., "teh" → "the") as single edits.
    /// Better for keyboard typo correction.
    DamerauLevenshtein,

    /// Edit distance with wildcards.
    ///
    /// Supports `?` (match one char) and `*` (match zero or more).
    /// Essential for damaged/OCR'd historical text.
    ///
    /// **Note**: Wildcards only work in the **mention** (query), not the candidate.
    /// When calling `compute(a, b)`, the mention is therefore the first argument.
    EditDistanceWildcard,
}
97
98impl SimilarityMetric {
99    /// Compute similarity score between two strings using this metric.
100    ///
101    /// Returns a value in [0.0, 1.0] where 1.0 = identical.
102    #[must_use]
103    pub fn compute(&self, a: &str, b: &str) -> f64 {
104        match self {
105            SimilarityMetric::Jaccard => string_similarity(a, b),
106            SimilarityMetric::EditDistance => edit_distance::edit_similarity(a, b),
107            SimilarityMetric::DamerauLevenshtein => {
108                // Normalize Damerau-Levenshtein to [0, 1] similarity
109                let dist = edit_distance::damerau_levenshtein(a, b);
110                let max_len = a.chars().count().max(b.chars().count());
111                if max_len == 0 {
112                    1.0
113                } else {
114                    1.0 - (dist as f64 / max_len as f64)
115                }
116            }
117            SimilarityMetric::EditDistanceWildcard => {
118                edit_distance::edit_similarity_wildcards(a, b)
119            }
120        }
121    }
122
123    /// Get a human-readable name for this metric.
124    #[must_use]
125    pub fn name(&self) -> &'static str {
126        match self {
127            SimilarityMetric::Jaccard => "jaccard",
128            SimilarityMetric::EditDistance => "edit-distance",
129            SimilarityMetric::DamerauLevenshtein => "damerau-levenshtein",
130            SimilarityMetric::EditDistanceWildcard => "edit-distance-wildcard",
131        }
132    }
133
134    /// Parse from string (for CLI).
135    ///
136    /// Note: This is not the standard `FromStr::from_str` trait method.
137    pub fn parse_str(s: &str) -> Option<Self> {
138        match s.to_lowercase().as_str() {
139            "jaccard" | "jac" => Some(SimilarityMetric::Jaccard),
140            "edit-distance" | "edit" | "levenshtein" | "lev" => {
141                Some(SimilarityMetric::EditDistance)
142            }
143            "damerau-levenshtein" | "damerau" | "dl" => Some(SimilarityMetric::DamerauLevenshtein),
144            "edit-distance-wildcard" | "wildcard" | "edw" => {
145                Some(SimilarityMetric::EditDistanceWildcard)
146            }
147            _ => None,
148        }
149    }
150}
151
/// A candidate KB entry for a mention.
///
/// Usually built with `Candidate::new` followed by the `with_*` builder
/// methods. The scoring fields (`string_sim`, `type_score`, `score`) are
/// plain public fields; `score` is only refreshed when `compute_score` or
/// `compute_score_with_temporal` is called.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Candidate {
    /// KB identifier (e.g., "Q937" for Wikidata)
    pub kb_id: String,
    /// Source knowledge base
    pub source: CandidateSource,
    /// Canonical name/label
    pub label: String,
    /// Aliases/alternate names
    pub aliases: Vec<String>,
    /// Description/gloss
    pub description: Option<String>,
    /// Entity type from KB (e.g., "human", "organization")
    pub kb_type: Option<String>,
    /// Wikipedia sitelink count (popularity proxy)
    pub sitelinks: Option<u32>,
    /// Prior probability (if known)
    ///
    /// `Candidate::new` initialises this to 0.0.
    pub prior: f64,
    /// String similarity to mention
    ///
    /// `Candidate::new` initialises this to 0.0; generators overwrite it.
    pub string_sim: f64,
    /// Type compatibility score
    ///
    /// `Candidate::new` initialises this to 1.0 (no type penalty).
    pub type_score: f64,
    /// Overall candidate score (for ranking)
    pub score: f64,
    /// Temporal validity start (ISO 8601 date string).
    ///
    /// For people: birth date. For organizations: founding date.
    /// Critical for historical document disambiguation where
    /// "President Bush" could refer to different people depending
    /// on the document date (Arora et al. 2024).
    pub valid_from: Option<String>,
    /// Temporal validity end (ISO 8601 date string).
    ///
    /// For people: death date. For organizations: dissolution date.
    pub valid_until: Option<String>,
}
189
190impl Candidate {
191    /// Create a new candidate.
192    pub fn new(kb_id: &str, source: CandidateSource, label: &str) -> Self {
193        Self {
194            kb_id: kb_id.to_string(),
195            source,
196            label: label.to_string(),
197            aliases: Vec::new(),
198            description: None,
199            kb_type: None,
200            sitelinks: None,
201            prior: 0.0,
202            string_sim: 0.0,
203            type_score: 1.0,
204            score: 0.0,
205            valid_from: None,
206            valid_until: None,
207        }
208    }
209
210    /// Set temporal validity start.
211    pub fn with_valid_from(mut self, date: &str) -> Self {
212        self.valid_from = Some(date.to_string());
213        self
214    }
215
216    /// Set temporal validity end.
217    pub fn with_valid_until(mut self, date: &str) -> Self {
218        self.valid_until = Some(date.to_string());
219        self
220    }
221
222    /// Add an alias.
223    pub fn with_alias(mut self, alias: &str) -> Self {
224        self.aliases.push(alias.to_string());
225        self
226    }
227
228    /// Set description.
229    pub fn with_description(mut self, desc: &str) -> Self {
230        self.description = Some(desc.to_string());
231        self
232    }
233
234    /// Set KB type.
235    pub fn with_kb_type(mut self, kb_type: &str) -> Self {
236        self.kb_type = Some(kb_type.to_string());
237        self
238    }
239
240    /// Set prior.
241    pub fn with_prior(mut self, prior: f64) -> Self {
242        self.prior = prior;
243        self
244    }
245
246    /// Compute overall score.
247    pub fn compute_score(&mut self) {
248        // Weighted combination of signals
249        self.score = 0.4 * self.string_sim
250            + 0.3 * self.prior
251            + 0.2 * self.type_score
252            + 0.1
253                * self
254                    .sitelinks
255                    .map(|s| (s as f64).log10() / 7.0)
256                    .unwrap_or(0.0);
257    }
258
259    /// Compute score with temporal context.
260    ///
261    /// For historical documents, temporal compatibility is critical for
262    /// disambiguation. "President Bush" in 1990 refers to George H.W. Bush,
263    /// while in 2005 it refers to George W. Bush.
264    ///
265    /// # Arguments
266    /// * `document_date` - ISO 8601 date string (e.g., "1990-01-15")
267    pub fn compute_score_with_temporal(&mut self, document_date: Option<&str>) {
268        // Base score
269        self.compute_score();
270
271        // Apply temporal penalty if document date is known
272        if let Some(doc_date) = document_date {
273            let temporal_score = self.temporal_compatibility(doc_date);
274            // Temporal compatibility can reduce score by up to 50%
275            self.score *= 0.5 + 0.5 * temporal_score;
276        }
277    }
278
279    /// Check temporal compatibility with a document date.
280    ///
281    /// Returns 1.0 if the candidate is valid at the document date,
282    /// 0.0 if clearly invalid, and intermediate values for uncertainty.
283    pub fn temporal_compatibility(&self, document_date: &str) -> f64 {
284        // Parse document date (just year for simplicity)
285        let doc_year = parse_year(document_date);
286
287        // Check if entity was "active" at document date
288        let from_year = self.valid_from.as_deref().and_then(parse_year);
289        let until_year = self.valid_until.as_deref().and_then(parse_year);
290
291        match (from_year, until_year, doc_year) {
292            // Can't determine temporal compatibility
293            (None, None, _) | (_, _, None) => 1.0,
294
295            // Entity not yet "born" at document date
296            (Some(from), _, Some(doc)) if doc < from => {
297                // Graduated penalty: 10 years before birth = 0.5
298                let years_before = from - doc;
299                (1.0 - years_before as f64 / 20.0).max(0.1)
300            }
301
302            // Entity "dead" before document date
303            (_, Some(until), Some(doc)) if doc > until => {
304                // Graduated penalty: 10 years after death = 0.5
305                let years_after = doc - until;
306                (1.0 - years_after as f64 / 20.0).max(0.1)
307            }
308
309            // Entity active at document date
310            _ => 1.0,
311        }
312    }
313
314    /// Get IRI/URI for this candidate.
315    pub fn to_iri(&self) -> String {
316        match &self.source {
317            CandidateSource::Wikidata => {
318                format!("http://www.wikidata.org/entity/{}", self.kb_id)
319            }
320            CandidateSource::YAGO => {
321                format!("http://yago-knowledge.org/resource/{}", self.kb_id)
322            }
323            CandidateSource::DBpedia => {
324                format!("http://dbpedia.org/resource/{}", self.kb_id)
325            }
326            CandidateSource::Wikipedia => {
327                format!("https://en.wikipedia.org/wiki/{}", self.kb_id)
328            }
329            CandidateSource::Freebase => {
330                format!("http://rdf.freebase.com/ns/{}", self.kb_id)
331            }
332            CandidateSource::UMLS => {
333                format!("https://uts.nlm.nih.gov/uts/umls/concept/{}", self.kb_id)
334            }
335            CandidateSource::GeoNames => {
336                format!("https://sws.geonames.org/{}/", self.kb_id)
337            }
338            CandidateSource::Custom(name) => {
339                format!("{}:{}", name, self.kb_id)
340            }
341        }
342    }
343
344    /// Get CURIE (Compact URI) for this candidate.
345    pub fn to_curie(&self) -> String {
346        let prefix = match &self.source {
347            CandidateSource::Wikidata => "wd",
348            CandidateSource::YAGO => "yago",
349            CandidateSource::DBpedia => "dbr",
350            CandidateSource::Wikipedia => "wp",
351            CandidateSource::Freebase => "fb",
352            CandidateSource::UMLS => "umls",
353            CandidateSource::GeoNames => "gn",
354            CandidateSource::Custom(name) => name,
355        };
356        format!("{}:{}", prefix, self.kb_id)
357    }
358}
359
/// Trait for candidate generators.
///
/// A generator maps a mention string to a list of [`Candidate`] KB entries.
/// Implementors must be `Send + Sync`.
pub trait CandidateGenerator: Send + Sync {
    /// Generate candidates for a mention.
    ///
    /// # Parameters
    /// - `mention`: The mention text
    /// - `context`: Surrounding text context
    /// - `entity_type`: Optional NER type constraint
    /// - `limit`: Maximum candidates to return
    ///
    /// # Returns
    /// At most `limit` candidates. Ordering is up to the implementation.
    fn generate(
        &self,
        mention: &str,
        context: &str,
        entity_type: Option<&str>,
        limit: usize,
    ) -> Vec<Candidate>;

    /// Name of this generator.
    fn name(&self) -> &'static str;
}
380
/// In-memory candidate generator using a preloaded dictionary.
///
/// Suitable for small KBs or testing.
///
/// Entries are keyed by lower-cased surface form (see `add_entry`), and one
/// surface form may map to several candidates. A freshly constructed
/// generator is empty and uses the default metric (`SimilarityMetric::Jaccard`).
///
/// # Similarity Metrics
///
/// The generator supports multiple similarity metrics via `with_metric()`:
///
/// ```rust
/// use anno::linking::{DictionaryCandidateGenerator, SimilarityMetric};
///
/// let gen = DictionaryCandidateGenerator::new()
///     .with_metric(SimilarityMetric::EditDistance)
///     .with_well_known();
///
/// // For OCR'd text with damage markers:
/// let gen_ocr = DictionaryCandidateGenerator::new()
///     .with_metric(SimilarityMetric::EditDistanceWildcard)
///     .with_well_known();
/// ```
#[derive(Debug, Clone, Default)]
pub struct DictionaryCandidateGenerator {
    /// Map from normalized surface form to candidates
    entries: HashMap<String, Vec<Candidate>>,
    /// Similarity metric for fuzzy matching
    metric: SimilarityMetric,
}
408
409impl DictionaryCandidateGenerator {
410    /// Create a new dictionary generator.
411    pub fn new() -> Self {
412        Self::default()
413    }
414
415    /// Set the similarity metric for fuzzy matching.
416    ///
417    /// # Example
418    ///
419    /// ```rust
420    /// use anno::linking::{DictionaryCandidateGenerator, SimilarityMetric};
421    ///
422    /// // For damaged historical text with wildcards
423    /// let gen = DictionaryCandidateGenerator::new()
424    ///     .with_metric(SimilarityMetric::EditDistanceWildcard);
425    /// ```
426    pub fn with_metric(mut self, metric: SimilarityMetric) -> Self {
427        self.metric = metric;
428        self
429    }
430
431    /// Get the current similarity metric.
432    #[must_use]
433    pub fn metric(&self) -> SimilarityMetric {
434        self.metric
435    }
436
437    /// Add an entry to the dictionary.
438    pub fn add_entry(&mut self, surface: &str, candidate: Candidate) {
439        let normalized = surface.to_lowercase();
440        self.entries.entry(normalized).or_default().push(candidate);
441    }
442
443    /// Load well-known entities (for demo/testing).
444    ///
445    /// Includes diverse entities across cultures and languages per multilingual guidelines.
446    /// Coverage: Western, East Asian, Middle Eastern, African, South Asian, Latin American.
447    pub fn with_well_known(mut self) -> Self {
448        let well_known = [
449            // === Scientists (diverse origins) ===
450            ("albert einstein", "Q937", "theoretical physicist"),
451            ("marie curie", "Q7186", "physicist and chemist"),
452            (
453                "tu youyou",
454                "Q546079",
455                "Chinese pharmacologist, Nobel laureate",
456            ),
457            ("屠呦呦", "Q546079", "Chinese pharmacologist"),
458            ("c.v. raman", "Q201010", "Indian physicist, Nobel laureate"),
459            (
460                "abdus salam",
461                "Q108365",
462                "Pakistani physicist, Nobel laureate",
463            ),
464            (
465                "wangari maathai",
466                "Q180728",
467                "Kenyan environmentalist, Nobel laureate",
468            ),
469            // === Political figures (global) ===
470            ("barack obama", "Q76", "44th President of the United States"),
471            ("angela merkel", "Q567", "Chancellor of Germany"),
472            ("習近平", "Q15031", "General Secretary of CCP"),
473            ("xi jinping", "Q15031", "General Secretary of CCP"),
474            ("narendra modi", "Q1058", "Prime Minister of India"),
475            ("नरेन्द्र मोदी", "Q1058", "Prime Minister of India"),
476            ("محمد بن سلمان", "Q6889872", "Crown Prince of Saudi Arabia"),
477            (
478                "mohammed bin salman",
479                "Q6889872",
480                "Crown Prince of Saudi Arabia",
481            ),
482            ("cyril ramaphosa", "Q312910", "President of South Africa"),
483            ("lula da silva", "Q37181", "President of Brazil"),
484            // === Technology companies (global) ===
485            ("google", "Q95", "American technology company"),
486            ("apple", "Q312", "American technology company"),
487            ("microsoft", "Q2283", "American technology company"),
488            ("alibaba", "Q306717", "Chinese technology company"),
489            ("阿里巴巴", "Q306717", "Chinese technology company"),
490            ("tencent", "Q860580", "Chinese technology company"),
491            ("腾讯", "Q860580", "Chinese technology company"),
492            ("samsung", "Q20718", "South Korean conglomerate"),
493            ("삼성", "Q20718", "South Korean conglomerate"),
494            ("tata", "Q752289", "Indian conglomerate"),
495            ("infosys", "Q723418", "Indian technology company"),
496            // === Cities (global coverage) ===
497            ("new york", "Q60", "city in New York State"),
498            ("london", "Q84", "capital of the United Kingdom"),
499            ("paris", "Q90", "capital of France"),
500            ("berlin", "Q64", "capital of Germany"),
501            ("tokyo", "Q1490", "capital of Japan"),
502            ("東京", "Q1490", "capital of Japan"),
503            ("beijing", "Q956", "capital of China"),
504            ("北京", "Q956", "capital of China"),
505            ("mumbai", "Q1156", "financial capital of India"),
506            ("मुंबई", "Q1156", "financial capital of India"),
507            ("cairo", "Q85", "capital of Egypt"),
508            ("القاهرة", "Q85", "capital of Egypt"),
509            ("são paulo", "Q174", "largest city in Brazil"),
510            ("lagos", "Q8673", "largest city in Nigeria"),
511            ("москва", "Q649", "capital of Russia"),
512            ("moscow", "Q649", "capital of Russia"),
513            ("dubai", "Q612", "city in UAE"),
514            ("دبي", "Q612", "city in UAE"),
515            ("singapore", "Q334", "city-state in Southeast Asia"),
516            ("新加坡", "Q334", "city-state in Southeast Asia"),
517            // === International organizations ===
518            ("united nations", "Q1065", "international organization"),
519            ("european union", "Q458", "political and economic union"),
520            (
521                "world health organization",
522                "Q7817",
523                "UN specialized agency",
524            ),
525            ("who", "Q7817", "World Health Organization"),
526            ("nato", "Q7184", "North Atlantic Treaty Organization"),
527            (
528                "african union",
529                "Q7159",
530                "continental union of African states",
531            ),
532            ("asean", "Q7768", "Association of Southeast Asian Nations"),
533            (
534                "opec",
535                "Q7795",
536                "Organization of the Petroleum Exporting Countries",
537            ),
538            // === Historical figures (diverse) ===
539            ("confucius", "Q4604", "Chinese philosopher"),
540            ("孔子", "Q4604", "Chinese philosopher"),
541            ("mahatma gandhi", "Q1001", "Indian independence leader"),
542            ("महात्मा गांधी", "Q1001", "Indian independence leader"),
543            (
544                "nelson mandela",
545                "Q8023",
546                "South African anti-apartheid leader",
547            ),
548            ("cleopatra", "Q635", "last Pharaoh of Egypt"),
549            ("genghis khan", "Q720", "founder of Mongol Empire"),
550            ("成吉思汗", "Q720", "founder of Mongol Empire"),
551            // === Cultural figures (diverse) ===
552            ("pelé", "Q12897", "Brazilian footballer"),
553            ("shakira", "Q34424", "Colombian singer"),
554            ("bts", "Q485927", "South Korean boy band"),
555            ("방탄소년단", "Q485927", "South Korean boy band"),
556            ("宮崎駿", "Q55400", "Japanese animator"),
557            ("hayao miyazaki", "Q55400", "Japanese animator"),
558        ];
559
560        for (surface, qid, desc) in well_known {
561            let candidate = Candidate::new(qid, CandidateSource::Wikidata, surface)
562                .with_description(desc)
563                .with_prior(0.5);
564            self.add_entry(surface, candidate);
565        }
566
567        self
568    }
569}
570
571impl CandidateGenerator for DictionaryCandidateGenerator {
572    fn generate(
573        &self,
574        mention: &str,
575        _context: &str,
576        _entity_type: Option<&str>,
577        limit: usize,
578    ) -> Vec<Candidate> {
579        let normalized = mention.to_lowercase();
580
581        // Exact match (skip wildcards for exact matching)
582        if !mention.contains('?') && !mention.contains('*') {
583            if let Some(candidates) = self.entries.get(&normalized) {
584                return candidates.iter().take(limit).cloned().collect();
585            }
586        }
587
588        // Fuzzy match using configured similarity metric
589        let mut results: Vec<Candidate> =
590            self.entries.iter().flat_map(|(_, v)| v.clone()).collect();
591
592        // Score by configured similarity metric
593        for c in &mut results {
594            c.string_sim = self.metric.compute(mention, &c.label);
595            c.compute_score();
596        }
597
598        // Filter out very low similarity matches
599        results.retain(|c| c.string_sim > 0.1);
600
601        results.sort_by(|a, b| {
602            b.score
603                .partial_cmp(&a.score)
604                .unwrap_or(std::cmp::Ordering::Equal)
605        });
606        results.truncate(limit);
607
608        results
609    }
610
611    fn name(&self) -> &'static str {
612        "dictionary"
613    }
614}
615
/// Compute simple string similarity (Jaccard on words + char trigrams).
///
/// The backend is chosen at compile time: with the `gramdex` feature enabled
/// this forwards to `string_similarity_gramdex`, otherwise to
/// `string_similarity_textprep`. Exactly one of the two `cfg` blocks below
/// survives compilation and becomes the function's tail expression.
pub fn string_similarity(a: &str, b: &str) -> f64 {
    // Keep this module's policy (weights) local, but reuse shared primitives.
    //
    // Default: delegate to `textprep` (self-contained within the `anno` repo).
    // Opt-in: `--features gramdex` uses `gramdex::trigram_jaccard` for the trigram
    // component so we can compare performance and semantics in benches.
    #[cfg(feature = "gramdex")]
    {
        string_similarity_gramdex(a, b)
    }
    #[cfg(not(feature = "gramdex"))]
    {
        string_similarity_textprep(a, b)
    }
}
632
/// `textprep`-backed implementation of [`string_similarity`].
///
/// Thin wrapper around `textprep::similarity::weighted_word_char_ngram_jaccard`.
// NOTE(review): the arguments `3, 0.6, 0.4` are presumably the character
// n-gram size followed by the word and n-gram weights (the `gramdex` variant
// in this file combines `0.6 * word + 0.4 * trigram`) — confirm against the
// `textprep` API.
#[must_use]
pub fn string_similarity_textprep(a: &str, b: &str) -> f64 {
    textprep::similarity::weighted_word_char_ngram_jaccard(a, b, 3, 0.6, 0.4)
}
638
/// `gramdex`-backed implementation of [`string_similarity`].
///
/// Only compiled with the `gramdex` feature. Returns
/// `0.6 * word_jaccard + 0.4 * trigram_jaccard`, where the word component
/// comes from `textprep` and the trigram component from `gramdex`.
#[cfg(feature = "gramdex")]
#[must_use]
pub fn string_similarity_gramdex(a: &str, b: &str) -> f64 {
    // Mirror the `textprep` weighted policy (word Jaccard + char trigram Jaccard).
    //
    // Note: `gramdex::trigram_jaccard`'s convention is 1.0 when both inputs have
    // zero trigrams (length < 3). We keep this behind a feature until we decide
    // whether we want to match `textprep` semantics exactly for short strings.
    let word = textprep::similarity::word_jaccard(a, b);
    // The trigram score is widened to f64 so both components share a type.
    let tri = gramdex::trigram_jaccard(a, b) as f64;
    0.6 * word + 0.4 * tri
}
652
/// Extract the year from an ISO 8601-style date string.
///
/// Surrounding whitespace is ignored. A single leading `-` marks a negative
/// (BCE) year; the year itself is everything up to the next `-`, parsed as
/// an `i32`.
///
/// Examples:
/// - "1990-01-15" → Some(1990)
/// - "1990" → Some(1990)
/// - "-0044" → Some(-44) (BCE dates)
///
/// Returns `None` for empty input or when the year field is not a valid
/// `i32`.
fn parse_year(date: &str) -> Option<i32> {
    let text = date.trim();
    if text.is_empty() {
        return None;
    }

    // Peel off an optional BCE marker before looking for the field separator.
    let (negative, unsigned) = match text.strip_prefix('-') {
        Some(rest) => (true, rest),
        None => (false, text),
    };

    // The year field ends at the first dash, or at the end of the string.
    let year_field = match unsigned.find('-') {
        Some(end) => &unsigned[..end],
        None => unsigned,
    };

    let magnitude = year_field.parse::<i32>().ok()?;
    Some(if negative { -magnitude } else { magnitude })
}
677
/// Score how well an NER type agrees with a KB entity type.
///
/// Comparison is case-insensitive. Results:
/// - 1.0 when either side is `None` (no constraint) or both are equal
/// - 0.95 for a person-like NER type against a KB type mentioning
///   "human" or "person"
/// - 0.9 for an organization-like NER type against "organization",
///   "company" or "institution"
/// - 0.9 for a location-like NER type against "city", "country", "place"
///   or "location"
/// - 0.3 for anything else (mismatch)
pub fn type_compatibility(ner_type: Option<&str>, kb_type: Option<&str>) -> f64 {
    // No constraint unless both sides are present.
    let (ner, kb) = match (ner_type, kb_type) {
        (Some(n), Some(k)) => (n.to_lowercase(), k.to_lowercase()),
        _ => return 1.0,
    };

    // Direct match
    if ner == kb {
        return 1.0;
    }

    // True when the KB type mentions any of the given keywords.
    let kb_mentions = |keywords: &[&str]| keywords.iter().any(|keyword| kb.contains(*keyword));

    let ner_is_person = ner.contains("person") || ner == "per";
    let ner_is_org = ner.contains("org") || ner == "organization";
    let ner_is_location = ner.contains("loc") || ner.contains("gpe") || ner == "location";

    if ner_is_person && kb_mentions(&["human", "person"]) {
        0.95
    } else if ner_is_org && kb_mentions(&["organization", "company", "institution"]) {
        0.9
    } else if ner_is_location && kb_mentions(&["city", "country", "place", "location"]) {
        0.9
    } else {
        // Mismatch
        0.3
    }
}
722
723#[cfg(test)]
724mod tests {
725    use super::*;
726
727    #[test]
728    fn test_dictionary_generator() {
729        let gen = DictionaryCandidateGenerator::new().with_well_known();
730
731        // Exact match
732        let candidates = gen.generate("albert einstein", "", None, 5);
733        assert!(!candidates.is_empty());
734        assert!(candidates[0].kb_id == "Q937");
735
736        // Partial match might not work depending on fuzzy logic
737        // The generator is designed for exact/close matches
738        let partial = gen.generate("Einstein", "", None, 5);
739        // Partial may or may not match - that's OK for dictionary-based
740        let _ = partial; // Just verify it doesn't panic
741    }
742
743    #[test]
744    fn test_string_similarity() {
745        // Keep the baseline behavior stable even if `--features gramdex` is enabled.
746        assert!(string_similarity_textprep("Albert Einstein", "Einstein") > 0.3);
747        assert!(string_similarity_textprep("Albert Einstein", "Albert Einstein") > 0.99);
748        assert!(string_similarity_textprep("New York", "New York City") > 0.5);
749    }
750
751    #[cfg(feature = "gramdex")]
752    #[test]
753    fn test_string_similarity_gramdex_bounds() {
754        let sim = string_similarity_gramdex("Albert Einstein", "Einstein");
755        assert!((0.0..=1.0).contains(&sim));
756    }
757
758    #[test]
759    fn test_type_compatibility() {
760        assert!(type_compatibility(Some("PERSON"), Some("human")) > 0.9);
761        assert!(type_compatibility(Some("ORG"), Some("company")) > 0.8);
762        assert!(type_compatibility(Some("PERSON"), Some("city")) < 0.5);
763    }
764
765    #[test]
766    fn test_candidate_iri() {
767        let c = Candidate::new("Q937", CandidateSource::Wikidata, "Einstein");
768        assert_eq!(c.to_iri(), "http://www.wikidata.org/entity/Q937");
769    }
770
771    #[test]
772    fn test_parse_year() {
773        assert_eq!(parse_year("1990-01-15"), Some(1990));
774        assert_eq!(parse_year("1990"), Some(1990));
775        assert_eq!(parse_year("-0044"), Some(-44)); // Julius Caesar
776        assert_eq!(parse_year(""), None);
777    }
778
779    #[test]
780    fn test_temporal_compatibility() {
781        // George H.W. Bush: 1924-2018
782        let bush_sr = Candidate::new("Q23505", CandidateSource::Wikidata, "George H. W. Bush")
783            .with_valid_from("1924-06-12")
784            .with_valid_until("2018-11-30");
785
786        // George W. Bush: 1946-present
787        let bush_jr = Candidate::new("Q207", CandidateSource::Wikidata, "George W. Bush")
788            .with_valid_from("1946-07-06");
789
790        // In 1990, both were alive - no temporal penalty
791        assert!(bush_sr.temporal_compatibility("1990-01-01") > 0.9);
792        assert!(bush_jr.temporal_compatibility("1990-01-01") > 0.9);
793
794        // In 2020, Bush Sr. had died 2 years prior - slight penalty
795        let sr_compat_2020 = bush_sr.temporal_compatibility("2020-01-01");
796        assert!(sr_compat_2020 < 1.0);
797        assert!(sr_compat_2020 > 0.5);
798
799        // Bush Jr. still alive - no penalty
800        assert!(bush_jr.temporal_compatibility("2020-01-01") > 0.9);
801    }
802
#[test]
fn test_compute_score_with_temporal() {
    // Julius Caesar: validity window lies entirely in BCE (negative years).
    let mut candidate = Candidate::new("Q1048", CandidateSource::Wikidata, "Julius Caesar")
        .with_valid_from("-0100-07-12")
        .with_valid_until("-0044-03-15")
        .with_prior(0.9);
    candidate.string_sim = 0.9;

    // Each scoring call overwrites `candidate.score`, so snapshot it after
    // every call before moving on to the next one.

    // 1. No document date supplied.
    candidate.compute_score();
    let baseline = candidate.score;

    // 2. Document dated 50 BCE, inside the validity window.
    candidate.compute_score_with_temporal(Some("-0050-01-01"));
    let contemporary = candidate.score;

    // 3. Document dated 2024, far outside the validity window.
    candidate.compute_score_with_temporal(Some("2024-01-01"));
    let anachronistic = candidate.score;

    // The in-window document must outscore the out-of-window one.
    assert!(contemporary > anachronistic);

    // The in-window score may not exceed the baseline, allowing a 0.01 tolerance.
    let within_tolerance = (contemporary - baseline).abs() < 0.01;
    assert!(contemporary <= baseline || within_tolerance);
}
829
830    // -------------------------------------------------------------------------
831    // Similarity Metric Tests
832    // -------------------------------------------------------------------------
833
#[test]
fn test_similarity_metric_jaccard() {
    let jaccard = SimilarityMetric::Jaccard;

    // Identical inputs score (almost exactly) 1.0.
    let identical = jaccard.compute("hello world", "hello world");
    assert!(identical > 0.99);

    // Sharing one of two words still yields a clearly non-zero overlap.
    let partial = jaccard.compute("hello world", "hello");
    assert!(partial > 0.3);
}
840
#[test]
fn test_similarity_metric_edit_distance() {
    let levenshtein = SimilarityMetric::EditDistance;
    // Every comparison in this test is made against the same reference name.
    let sim_to_einstein = |other: &str| levenshtein.compute("Einstein", other);

    assert!(sim_to_einstein("Einstein") > 0.99); // exact match
    assert!(sim_to_einstein("Einstien") > 0.7); // swapped-letter typo
    assert!(sim_to_einstein("Newton") < 0.5); // unrelated name
}
848
#[test]
fn test_similarity_metric_damerau() {
    let damerau = SimilarityMetric::DamerauLevenshtein;

    // Adjacent-letter swaps are a frequent kind of typo; both examples
    // below differ from the intended word by one swapped pair.
    let short_swap = damerau.compute("teh", "the");
    assert!(short_swap > 0.6);

    let long_swap = damerau.compute("recieve", "receive");
    assert!(long_swap > 0.8);
}
856
#[test]
fn test_similarity_metric_wildcard() {
    let wildcard = SimilarityMetric::EditDistanceWildcard;
    // A pattern "fully matches" when the similarity is essentially 1.0.
    let fully_matches = |pattern: &str, text: &str| wildcard.compute(pattern, text) > 0.99;

    // `?` inside a word, `*` at the end, and `*` at the start of the query.
    assert!(fully_matches("R?ma", "Roma"));
    assert!(fully_matches("Ein*", "Einstein"));
    assert!(fully_matches("*stein", "Einstein"));

    // Damaged-inscription style query: three unreadable leading characters.
    assert!(fully_matches("???TOR", "CASTOR"));
}
869
#[test]
fn test_similarity_metric_from_str() {
    let parse = |name: &str| SimilarityMetric::parse_str(name);

    // Canonical names and aliases accepted by `parse_str`.
    assert_eq!(parse("jaccard"), Some(SimilarityMetric::Jaccard));
    assert_eq!(parse("edit-distance"), Some(SimilarityMetric::EditDistance));
    assert_eq!(parse("lev"), Some(SimilarityMetric::EditDistance));
    assert_eq!(parse("wildcard"), Some(SimilarityMetric::EditDistanceWildcard));

    // Unrecognised names are rejected rather than mapped to a default.
    assert_eq!(parse("unknown"), None);
}
890
#[test]
fn test_generator_with_edit_distance() {
    let generator = DictionaryCandidateGenerator::new()
        .with_metric(SimilarityMetric::EditDistance)
        .with_well_known();

    // An exactly spelled mention must produce at least one candidate whose
    // label mentions Einstein.
    let exact = generator.generate("Albert Einstein", "", None, 5);
    assert!(!exact.is_empty());
    let mentions_einstein = exact
        .iter()
        .any(|candidate| candidate.label.to_lowercase().contains("einstein"));
    assert!(mentions_einstein);

    // Misspelled mention: whether anything comes back depends on the
    // dictionary size and the similarity threshold, so this is only a
    // smoke test that the call completes without panicking.
    let _typo_candidates = generator.generate("Einstien", "", None, 5);
}
910
#[test]
fn test_generator_with_wildcard() {
    let generator = DictionaryCandidateGenerator::new()
        .with_metric(SimilarityMetric::EditDistanceWildcard)
        .with_well_known();

    // Damaged-text style query: the surname is cut off after its first
    // letter and replaced by `*`. The query "marie c*" is expected to
    // surface a candidate whose label contains "curie".
    let matches = generator.generate("marie c*", "", None, 10);
    assert!(!matches.is_empty());

    let found_curie = matches
        .iter()
        .any(|candidate| candidate.label.to_lowercase().contains("curie"));
    assert!(found_curie);
}
924
#[test]
fn test_similarity_metric_cjk() {
    // Edit distance on CJK text: a one-character change in a two-character
    // word should give a mid-range similarity.
    let levenshtein = SimilarityMetric::EditDistance;

    // 北京 (Beijing) vs 北平 (its former name): second character differs.
    let one_char_off = levenshtein.compute("北京", "北平");
    assert!(
        one_char_off > 0.4 && one_char_off < 0.9,
        "CJK similarity: {}",
        one_char_off
    );

    // Identical CJK strings score (almost exactly) 1.0.
    let identical = levenshtein.compute("東京", "東京");
    assert!(identical > 0.99);
}
937}
938
939// =============================================================================
940// Property Tests
941// =============================================================================
942
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    // Strategy for short strings (for expensive operations).
    // Only ASCII letters, digits and spaces are generated, so these strings
    // never contain the `?` / `*` wildcard characters.
    fn arb_short_string() -> impl Strategy<Value = String> {
        prop::string::string_regex("[a-zA-Z0-9 ]{0,30}")
            .expect("hard-coded regex is a valid proptest string pattern")
    }

    // Strategy for entity-like strings: two capitalised words separated by a
    // single space. The pattern cannot produce an empty string, so no extra
    // non-empty filter is needed.
    fn arb_entity_name() -> impl Strategy<Value = String> {
        prop::string::string_regex("[A-Z][a-z]+ [A-Z][a-z]+")
            .expect("hard-coded regex is a valid proptest string pattern")
    }

    // Strategy yielding each of the four similarity metrics.
    fn arb_metric() -> impl Strategy<Value = SimilarityMetric> {
        prop_oneof![
            Just(SimilarityMetric::Jaccard),
            Just(SimilarityMetric::EditDistance),
            Just(SimilarityMetric::DamerauLevenshtein),
            Just(SimilarityMetric::EditDistanceWildcard),
        ]
    }

    // -------------------------------------------------------------------------
    // SimilarityMetric Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// All metrics return values in [0, 1]
        #[test]
        fn prop_metric_bounds(metric in arb_metric(), a in arb_short_string(), b in arb_short_string()) {
            let sim = metric.compute(&a, &b);
            prop_assert!(
                (0.0..=1.0).contains(&sim),
                "Similarity {} out of [0,1] for {:?}",
                sim,
                metric
            );
        }

        /// All metrics give 1.0 for identical strings
        #[test]
        fn prop_metric_identity(metric in arb_metric(), s in arb_short_string()) {
            let sim = metric.compute(&s, &s);
            prop_assert!(
                (sim - 1.0).abs() < 1e-10,
                "Identity similarity should be 1.0, got {} for {:?}", sim, metric
            );
        }

        /// Jaccard, EditDistance, DamerauLevenshtein are symmetric
        /// (EditDistanceWildcard is NOT symmetric due to pattern matching)
        #[test]
        fn prop_symmetric_metrics_symmetric(a in arb_short_string(), b in arb_short_string()) {
            for metric in [
                SimilarityMetric::Jaccard,
                SimilarityMetric::EditDistance,
                SimilarityMetric::DamerauLevenshtein,
            ] {
                let sim1 = metric.compute(&a, &b);
                let sim2 = metric.compute(&b, &a);
                prop_assert!(
                    (sim1 - sim2).abs() < 1e-10,
                    "{:?} not symmetric: ({},{})={} vs ({},{})={}", metric, a, b, sim1, b, a, sim2
                );
            }
        }

        /// Metric name round-trips through `parse_str` whenever the name is
        /// recognised by the parser.
        #[test]
        fn prop_metric_name_roundtrip(metric in arb_metric()) {
            let name = metric.name();
            // A name that `parse_str` does not recognise is deliberately
            // tolerated here: some names might not round-trip exactly (aliases).
            if let Some(recovered) = SimilarityMetric::parse_str(name) {
                prop_assert_eq!(metric, recovered);
            }
        }

        /// Empty string has similarity 1.0 with itself
        #[test]
        fn prop_metric_empty_identity(metric in arb_metric()) {
            let sim = metric.compute("", "");
            prop_assert!(
                (sim - 1.0).abs() < 1e-10,
                "Empty string identity should be 1.0, got {} for {:?}", sim, metric
            );
        }
    }

    // -------------------------------------------------------------------------
    // Candidate Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// Candidate score is in [0, 1]
        #[test]
        fn prop_candidate_score_bounds(
            kb_id in "[A-Z][0-9]+",
            label in arb_entity_name(),
            string_sim in 0.0f64..1.0,
            prior in 0.0f64..1.0
        ) {
            let mut candidate = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            candidate.string_sim = string_sim;
            candidate.prior = prior;
            candidate.compute_score();

            // Same range-check form as `prop_metric_bounds`.
            prop_assert!(
                (0.0..=1.0).contains(&candidate.score),
                "Score {} out of [0,1]", candidate.score
            );
        }

        /// Candidate kb_id is deterministic
        #[test]
        fn prop_candidate_kb_id_deterministic(
            kb_id in "[A-Z][0-9]+",
            label in arb_entity_name()
        ) {
            let c1 = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            let c2 = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            prop_assert_eq!(c1.kb_id, c2.kb_id);
        }

        /// Candidate serde round-trip
        #[test]
        fn prop_candidate_serde_roundtrip(
            kb_id in "[A-Z][0-9]+",
            label in arb_entity_name()
        ) {
            let candidate = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            let json = serde_json::to_string(&candidate).unwrap();
            let recovered: Candidate = serde_json::from_str(&json).unwrap();

            prop_assert_eq!(candidate.kb_id, recovered.kb_id);
            prop_assert_eq!(candidate.label, recovered.label);
        }
    }

    // -------------------------------------------------------------------------
    // DictionaryCandidateGenerator Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// Generator always returns <= limit candidates
        #[test]
        fn prop_generator_respects_limit(
            mention in arb_entity_name(),
            limit in 1usize..20
        ) {
            // `gen` is a reserved keyword in the 2024 edition, hence the longer name.
            let generator = DictionaryCandidateGenerator::new().with_well_known();
            let candidates = generator.generate(&mention, "", None, limit);
            prop_assert!(
                candidates.len() <= limit,
                "Got {} candidates but limit was {}", candidates.len(), limit
            );
        }

        /// Generator name is consistent
        #[test]
        fn prop_generator_name_consistent(metric in arb_metric()) {
            let generator = DictionaryCandidateGenerator::new().with_metric(metric);
            let name = generator.name();
            prop_assert!(!name.is_empty());
        }

        /// Generator with metric round-trips metric correctly
        #[test]
        fn prop_generator_metric_set(metric in arb_metric()) {
            let generator = DictionaryCandidateGenerator::new().with_metric(metric);
            prop_assert_eq!(generator.metric(), metric);
        }

        /// Candidates are sorted by score descending
        #[test]
        fn prop_candidates_sorted_descending(mention in arb_entity_name()) {
            let generator = DictionaryCandidateGenerator::new().with_well_known();
            let candidates = generator.generate(&mention, "", None, 10);

            // Compare each adjacent pair; `idx` is the position of the first
            // element of the pair.
            for (idx, pair) in candidates.windows(2).enumerate() {
                prop_assert!(
                    pair[0].score >= pair[1].score,
                    "Candidates not sorted: {} < {} at positions {}-{}",
                    pair[0].score, pair[1].score, idx, idx + 1
                );
            }
        }
    }

    // -------------------------------------------------------------------------
    // CandidateSource Properties
    // -------------------------------------------------------------------------

    proptest! {
        /// CandidateSource serde round-trip
        #[test]
        fn prop_source_serde_roundtrip(source in prop_oneof![
            Just(CandidateSource::Wikidata),
            Just(CandidateSource::YAGO),
            Just(CandidateSource::DBpedia),
            Just(CandidateSource::Wikipedia),
            Just(CandidateSource::Freebase),
            Just(CandidateSource::UMLS),
            Just(CandidateSource::GeoNames),
        ]) {
            let json = serde_json::to_string(&source).unwrap();
            let recovered: CandidateSource = serde_json::from_str(&json).unwrap();
            prop_assert_eq!(source, recovered);
        }

        /// Custom source round-trips
        #[test]
        fn prop_custom_source_roundtrip(name in "[a-z]+") {
            let source = CandidateSource::Custom(name);
            let json = serde_json::to_string(&source).unwrap();
            let recovered: CandidateSource = serde_json::from_str(&json).unwrap();

            // Comparing whole values checks both the variant and its payload,
            // and reports the mismatching values on failure.
            prop_assert_eq!(source, recovered);
        }
    }
}