Skip to main content

anno/linking/
wikidata.rs

1//! Wikidata utilities (offline).
2//!
3//! This module is intentionally **offline**: it does not call Wikidata’s APIs.
4//! It provides:
5//! - a small in-memory dictionary (`WikidataDictionary`) for demos/tests
6//! - type mapping helpers (`WikidataTypeMapper`, `WikidataNERType`)
7//!
8//! # Example
9//!
10//! ```rust
11//! use anno::linking::wikidata::WikidataDictionary;
12//!
13//! let dict = WikidataDictionary::with_common_entities();
14//! let cands = dict.lookup("Einstein");
15//! assert!(!cands.is_empty());
16//! ```
17//!
18//! # Wikidata Type Mapping
19//!
20//! ```text
21//! Wikidata Instance-of        → NER Type
22//! ─────────────────────────────────────────
23//! Q5 (human)                  → PER
24//! Q43229 (organization)       → ORG
25//! Q4830453 (business)         → ORG
26//! Q515 (city)                 → LOC
//! Q6256 (country)             → GPE
28//! Q35127 (website)            → PRODUCT
29//! Q571 (book)                 → WORK_OF_ART
30//! Q11424 (film)               → WORK_OF_ART
31//! ```
32//!
33//! Note: if you need real Wikidata API integration, treat it as an external dependency and
34//! keep network behavior explicit in your application.
35
36use serde::{Deserialize, Serialize};
37use std::collections::HashMap;
38
39// =============================================================================
40// Configuration
41// =============================================================================
42
/// Settings describing how a Wikidata lookup would be performed.
///
/// NOTE(review): no code visible in this (offline) module reads these values;
/// they look intended for an external, network-backed linker — confirm.
#[derive(Debug, Clone)]
pub struct WikidataConfig {
    /// URL of the Wikidata API endpoint.
    pub api_endpoint: String,
    /// Maximum number of candidates to retrieve.
    pub max_candidates: usize,
    /// Minimum search score a candidate must reach.
    pub min_score: f64,
    /// Language codes used for label retrieval, most preferred first.
    pub languages: Vec<String>,
    /// Request timeout, in seconds.
    pub timeout_secs: u64,
    /// Whether caching is enabled.
    pub enable_cache: bool,
    /// Cache time-to-live, in seconds.
    pub cache_ttl: u64,
}

impl Default for WikidataConfig {
    /// Stock configuration: public Wikidata endpoint, up to 10 candidates,
    /// no score cut-off, `en`/`de`/`fr` labels, a 10 second timeout and a
    /// one-hour cache.
    fn default() -> Self {
        let languages = ["en", "de", "fr"]
            .iter()
            .map(|code| (*code).to_owned())
            .collect();

        WikidataConfig {
            api_endpoint: String::from("https://www.wikidata.org/w/api.php"),
            max_candidates: 10,
            min_score: 0.0,
            languages,
            timeout_secs: 10,
            enable_cache: true,
            // One hour, expressed in seconds.
            cache_ttl: 60 * 60,
        }
    }
}
75
76// =============================================================================
77// Entity Types
78// =============================================================================
79
/// A Wikidata entity (Q-item).
///
/// Plain data record; all fields are public and the type is
/// (de)serializable through serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WikidataEntity {
    /// Q-identifier (e.g., "Q937")
    pub qid: String,
    /// Primary label in preferred language
    pub label: String,
    /// Short description, if one is available
    pub description: Option<String>,
    /// Alternative names/aliases
    pub aliases: Vec<String>,
    /// Instance-of types (Q-IDs)
    pub instance_of: Vec<String>,
    /// Subclass-of types (Q-IDs)
    pub subclass_of: Vec<String>,
    /// Number of Wikipedia sitelinks (popularity proxy)
    pub sitelinks: u32,
    /// Mapped NER entity type, if one is known
    pub entity_type: Option<WikidataNERType>,
    /// Wikipedia URL (if available)
    pub wikipedia_url: Option<String>,
    /// Image URL (if available)
    pub image_url: Option<String>,
}
104
105impl WikidataEntity {
106    /// Create a new entity.
107    pub fn new(qid: &str, label: &str) -> Self {
108        Self {
109            qid: qid.to_string(),
110            label: label.to_string(),
111            description: None,
112            aliases: Vec::new(),
113            instance_of: Vec::new(),
114            subclass_of: Vec::new(),
115            sitelinks: 0,
116            entity_type: None,
117            wikipedia_url: None,
118            image_url: None,
119        }
120    }
121
122    /// Get the Wikidata IRI.
123    #[must_use]
124    pub fn iri(&self) -> String {
125        format!("http://www.wikidata.org/entity/{}", self.qid)
126    }
127
128    /// Check if entity matches a mention (label or alias).
129    #[must_use]
130    pub fn matches_mention(&self, mention: &str) -> bool {
131        let mention_lower = mention.to_lowercase();
132
133        if self.label.to_lowercase() == mention_lower {
134            return true;
135        }
136
137        self.aliases
138            .iter()
139            .any(|a| a.to_lowercase() == mention_lower)
140    }
141}
142
/// Coarse NER category that a Wikidata type (Q-ID) can be mapped to.
///
/// The Q-IDs quoted on the variants are examples taken from the default
/// table built by `WikidataTypeMapper::new`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WikidataNERType {
    /// Person (e.g. Q5: human)
    Person,
    /// Organization (e.g. Q43229, Q4830453)
    Organization,
    /// Location (e.g. Q515: city, Q8502: mountain)
    Location,
    /// Geopolitical entity (e.g. Q6256: country, Q3624078: sovereign state)
    GeopoliticalEntity,
    /// Event (e.g. Q1656682, Q18669875)
    Event,
    /// Product (e.g. Q2424752)
    Product,
    /// Work of art (e.g. Q838948)
    WorkOfArt,
    /// Date/time entity (no Q-ID in the default mapper table maps to this)
    DateTime,
    /// Miscellaneous/other (no Q-ID in the default mapper table maps to this)
    Miscellaneous,
}
165
166impl WikidataNERType {
167    /// Convert to standard anno EntityType string.
168    #[must_use]
169    pub fn to_entity_type_str(&self) -> &'static str {
170        match self {
171            Self::Person => "PER",
172            Self::Organization => "ORG",
173            Self::Location => "LOC",
174            Self::GeopoliticalEntity => "GPE",
175            Self::Event => "EVENT",
176            Self::Product => "PRODUCT",
177            Self::WorkOfArt => "WORK_OF_ART",
178            Self::DateTime => "DATE",
179            Self::Miscellaneous => "MISC",
180        }
181    }
182}
183
184// =============================================================================
185// Type Mapping
186// =============================================================================
187
188/// Maps Wikidata types (Q-IDs) to NER types.
189#[derive(Debug, Clone, Default)]
190pub struct WikidataTypeMapper {
191    /// Q-ID to NER type mapping
192    mappings: HashMap<String, WikidataNERType>,
193}
194
195impl WikidataTypeMapper {
196    /// Create a new mapper with default mappings.
197    #[must_use]
198    pub fn new() -> Self {
199        let mut mappings = HashMap::new();
200
201        // Person types
202        mappings.insert("Q5".to_string(), WikidataNERType::Person); // human
203        mappings.insert("Q215627".to_string(), WikidataNERType::Person); // person
204        mappings.insert("Q95074".to_string(), WikidataNERType::Person); // fictional character
205
206        // Organization types
207        mappings.insert("Q43229".to_string(), WikidataNERType::Organization); // organization
208        mappings.insert("Q4830453".to_string(), WikidataNERType::Organization); // business
209        mappings.insert("Q783794".to_string(), WikidataNERType::Organization); // company
210        mappings.insert("Q891723".to_string(), WikidataNERType::Organization); // public company
211        mappings.insert("Q3918".to_string(), WikidataNERType::Organization); // university
212        mappings.insert("Q7278".to_string(), WikidataNERType::Organization); // political party
213        mappings.insert("Q476028".to_string(), WikidataNERType::Organization); // sports club
214        mappings.insert("Q327333".to_string(), WikidataNERType::Organization); // government agency
215
216        // Location types
217        mappings.insert("Q515".to_string(), WikidataNERType::Location); // city
218        mappings.insert("Q532".to_string(), WikidataNERType::Location); // village
219        mappings.insert("Q5084".to_string(), WikidataNERType::Location); // hamlet
220        mappings.insert("Q1549591".to_string(), WikidataNERType::Location); // big city
221        mappings.insert("Q486972".to_string(), WikidataNERType::Location); // human settlement
222        mappings.insert("Q82794".to_string(), WikidataNERType::Location); // geographic region
223        mappings.insert("Q46831".to_string(), WikidataNERType::Location); // mountain range
224        mappings.insert("Q8502".to_string(), WikidataNERType::Location); // mountain
225        mappings.insert("Q4022".to_string(), WikidataNERType::Location); // river
226        mappings.insert("Q23397".to_string(), WikidataNERType::Location); // lake
227
228        // Geopolitical entity types
229        mappings.insert("Q6256".to_string(), WikidataNERType::GeopoliticalEntity); // country
230        mappings.insert("Q3624078".to_string(), WikidataNERType::GeopoliticalEntity); // sovereign state
231        mappings.insert("Q7275".to_string(), WikidataNERType::GeopoliticalEntity); // state
232        mappings.insert("Q35657".to_string(), WikidataNERType::GeopoliticalEntity); // administrative territorial entity
233
234        // Event types
235        mappings.insert("Q1656682".to_string(), WikidataNERType::Event); // event
236        mappings.insert("Q18669875".to_string(), WikidataNERType::Event); // recurring event
237        mappings.insert("Q198".to_string(), WikidataNERType::Event); // war
238        mappings.insert("Q11483816".to_string(), WikidataNERType::Event); // natural disaster
239
240        // Product types
241        mappings.insert("Q2424752".to_string(), WikidataNERType::Product); // product
242        mappings.insert("Q35127".to_string(), WikidataNERType::Product); // website
243        mappings.insert("Q7889".to_string(), WikidataNERType::Product); // video game
244        mappings.insert("Q22811662".to_string(), WikidataNERType::Product); // mobile app
245
246        // Work of art types
247        mappings.insert("Q838948".to_string(), WikidataNERType::WorkOfArt); // work of art
248        mappings.insert("Q571".to_string(), WikidataNERType::WorkOfArt); // book
249        mappings.insert("Q11424".to_string(), WikidataNERType::WorkOfArt); // film
250        mappings.insert("Q7725634".to_string(), WikidataNERType::WorkOfArt); // literary work
251        mappings.insert("Q105543609".to_string(), WikidataNERType::WorkOfArt); // musical work
252        mappings.insert("Q134556".to_string(), WikidataNERType::WorkOfArt); // single (music)
253        mappings.insert("Q482994".to_string(), WikidataNERType::WorkOfArt); // album
254
255        Self { mappings }
256    }
257
258    /// Map a Wikidata type Q-ID to NER type.
259    #[must_use]
260    pub fn map_type(&self, qid: &str) -> Option<WikidataNERType> {
261        self.mappings.get(qid).copied()
262    }
263
264    /// Map multiple types, returning the most specific match.
265    #[must_use]
266    pub fn map_types(&self, qids: &[String]) -> Option<WikidataNERType> {
267        // Priority: Person > GPE > Org > Loc > Event > Work > Product > Misc
268        let priority = [
269            WikidataNERType::Person,
270            WikidataNERType::GeopoliticalEntity,
271            WikidataNERType::Organization,
272            WikidataNERType::Location,
273            WikidataNERType::Event,
274            WikidataNERType::WorkOfArt,
275            WikidataNERType::Product,
276        ];
277
278        for ptype in &priority {
279            for qid in qids {
280                if let Some(mapped) = self.map_type(qid) {
281                    if &mapped == ptype {
282                        return Some(mapped);
283                    }
284                }
285            }
286        }
287
288        // Check if any match
289        for qid in qids {
290            if let Some(mapped) = self.map_type(qid) {
291                return Some(mapped);
292            }
293        }
294
295        None
296    }
297
298    /// Add a custom mapping.
299    pub fn add_mapping(&mut self, qid: &str, ner_type: WikidataNERType) {
300        self.mappings.insert(qid.to_string(), ner_type);
301    }
302}
303
304// =============================================================================
305// Search Result
306// =============================================================================
307
/// A search result from Wikidata.
///
/// Plain, serde-(de)serializable record.
/// NOTE(review): nothing visible in this offline module constructs or consumes
/// this type; it is presumably filled in by an external search client — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WikidataSearchResult {
    /// Q-identifier
    pub qid: String,
    /// Primary label
    pub label: String,
    /// Short description, if one is available
    pub description: Option<String>,
    /// Match score
    pub score: f64,
    /// Whether this is an exact match
    pub exact_match: bool,
}
322
323// =============================================================================
324// Linker (Offline/Dictionary-based)
325// =============================================================================
326
327/// Offline Wikidata linker using a pre-built dictionary.
328///
329/// For production use without API calls.
330#[derive(Debug, Clone, Default)]
331pub struct WikidataDictionary {
332    /// Entity lookup by label (lowercase)
333    by_label: HashMap<String, Vec<WikidataEntity>>,
334    /// Entity lookup by Q-ID
335    by_qid: HashMap<String, WikidataEntity>,
336    /// Type mapper
337    type_mapper: WikidataTypeMapper,
338}
339
340impl WikidataDictionary {
341    /// Create a new empty dictionary.
342    #[must_use]
343    pub fn new() -> Self {
344        Self {
345            by_label: HashMap::new(),
346            by_qid: HashMap::new(),
347            type_mapper: WikidataTypeMapper::new(),
348        }
349    }
350
351    /// Add an entity to the dictionary.
352    pub fn add_entity(&mut self, mut entity: WikidataEntity) {
353        // Map type
354        entity.entity_type = self.type_mapper.map_types(&entity.instance_of);
355
356        // Index by label
357        let label_key = entity.label.to_lowercase();
358        self.by_label
359            .entry(label_key)
360            .or_default()
361            .push(entity.clone());
362
363        // Index by aliases
364        for alias in &entity.aliases {
365            let alias_key = alias.to_lowercase();
366            self.by_label
367                .entry(alias_key)
368                .or_default()
369                .push(entity.clone());
370        }
371
372        // Index by Q-ID
373        self.by_qid.insert(entity.qid.clone(), entity);
374    }
375
376    /// Look up entities by mention text.
377    #[must_use]
378    pub fn lookup(&self, mention: &str) -> Vec<&WikidataEntity> {
379        let key = mention.to_lowercase();
380        self.by_label
381            .get(&key)
382            .map_or(Vec::new(), |v| v.iter().collect())
383    }
384
385    /// Get entity by Q-ID.
386    #[must_use]
387    pub fn get(&self, qid: &str) -> Option<&WikidataEntity> {
388        self.by_qid.get(qid)
389    }
390
391    /// Number of entities in dictionary.
392    #[must_use]
393    pub fn len(&self) -> usize {
394        self.by_qid.len()
395    }
396
397    /// Check if dictionary is empty.
398    #[must_use]
399    pub fn is_empty(&self) -> bool {
400        self.by_qid.is_empty()
401    }
402
403    /// Link a mention to the best matching entity.
404    #[must_use]
405    pub fn link(
406        &self,
407        mention: &str,
408        expected_type: Option<WikidataNERType>,
409    ) -> Option<&WikidataEntity> {
410        let candidates = self.lookup(mention);
411
412        if candidates.is_empty() {
413            return None;
414        }
415
416        // If type expected, filter by type
417        if let Some(etype) = expected_type {
418            let filtered: Vec<_> = candidates
419                .iter()
420                .filter(|e| e.entity_type == Some(etype))
421                .copied()
422                .collect();
423
424            if !filtered.is_empty() {
425                // Return most popular from filtered
426                return filtered.into_iter().max_by_key(|e| e.sitelinks);
427            }
428        }
429
430        // Return most popular from all candidates
431        candidates.into_iter().max_by_key(|e| e.sitelinks)
432    }
433
434    /// Create dictionary with some well-known entities.
435    #[must_use]
436    pub fn with_common_entities() -> Self {
437        let mut dict = Self::new();
438
439        // Add some well-known entities for demonstration
440        let entities = vec![
441            WikidataEntity {
442                qid: "Q937".to_string(),
443                label: "Albert Einstein".to_string(),
444                description: Some("German-born theoretical physicist".to_string()),
445                aliases: vec!["Einstein".to_string(), "A. Einstein".to_string()],
446                instance_of: vec!["Q5".to_string()],
447                subclass_of: Vec::new(),
448                sitelinks: 500,
449                entity_type: Some(WikidataNERType::Person),
450                wikipedia_url: Some("https://en.wikipedia.org/wiki/Albert_Einstein".to_string()),
451                image_url: None,
452            },
453            WikidataEntity {
454                qid: "Q312".to_string(),
455                label: "Apple Inc.".to_string(),
456                description: Some("American multinational technology company".to_string()),
457                aliases: vec!["Apple".to_string(), "Apple Computer".to_string()],
458                instance_of: vec!["Q4830453".to_string()],
459                subclass_of: Vec::new(),
460                sitelinks: 400,
461                entity_type: Some(WikidataNERType::Organization),
462                wikipedia_url: Some("https://en.wikipedia.org/wiki/Apple_Inc.".to_string()),
463                image_url: None,
464            },
465            WikidataEntity {
466                qid: "Q60".to_string(),
467                label: "New York City".to_string(),
468                description: Some("Most populous city in the United States".to_string()),
469                aliases: vec![
470                    "NYC".to_string(),
471                    "New York".to_string(),
472                    "The Big Apple".to_string(),
473                ],
474                instance_of: vec!["Q515".to_string()],
475                subclass_of: Vec::new(),
476                sitelinks: 450,
477                entity_type: Some(WikidataNERType::Location),
478                wikipedia_url: Some("https://en.wikipedia.org/wiki/New_York_City".to_string()),
479                image_url: None,
480            },
481            WikidataEntity {
482                qid: "Q30".to_string(),
483                label: "United States of America".to_string(),
484                description: Some("Country primarily located in North America".to_string()),
485                aliases: vec![
486                    "USA".to_string(),
487                    "United States".to_string(),
488                    "US".to_string(),
489                    "America".to_string(),
490                ],
491                instance_of: vec!["Q6256".to_string()],
492                subclass_of: Vec::new(),
493                sitelinks: 550,
494                entity_type: Some(WikidataNERType::GeopoliticalEntity),
495                wikipedia_url: Some("https://en.wikipedia.org/wiki/United_States".to_string()),
496                image_url: None,
497            },
498        ];
499
500        for entity in entities {
501            dict.add_entity(entity);
502        }
503
504        dict
505    }
506}
507
508// =============================================================================
509// Tests
510// =============================================================================
511
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks one Q-ID per major category plus an unknown Q-ID.
    #[test]
    fn test_type_mapping() {
        let mapper = WikidataTypeMapper::new();

        let expectations = [
            ("Q5", Some(WikidataNERType::Person)),
            ("Q43229", Some(WikidataNERType::Organization)),
            ("Q515", Some(WikidataNERType::Location)),
            ("Q6256", Some(WikidataNERType::GeopoliticalEntity)),
            ("Q99999999", None),
        ];

        for &(qid, expected) in expectations.iter() {
            assert_eq!(mapper.map_type(qid), expected);
        }
    }

    /// Label and alias match case-insensitively; a partial name does not.
    #[test]
    fn test_entity_matches_mention() {
        let entity = WikidataEntity {
            aliases: vec!["Einstein".to_string()],
            ..WikidataEntity::new("Q937", "Albert Einstein")
        };

        for mention in ["Albert Einstein", "einstein", "Einstein"].iter() {
            assert!(entity.matches_mention(mention));
        }
        assert!(!entity.matches_mention("Albert"));
    }

    /// Lookup by label, by alias, in a different case, and for a miss.
    #[test]
    fn test_dictionary_lookup() {
        let dict = WikidataDictionary::with_common_entities();

        // Full label
        let by_label = dict.lookup("Albert Einstein");
        assert_eq!(by_label.len(), 1);
        assert_eq!(by_label[0].qid, "Q937");

        // Alias, as written and upper-cased
        for mention in ["Einstein", "EINSTEIN"].iter() {
            assert_eq!(dict.lookup(mention).len(), 1);
        }

        // Unknown mention
        assert!(dict.lookup("Nonexistent Entity").is_empty());
    }

    /// Linking "Apple" with an organization hint resolves to Apple Inc.
    /// (the demo dictionary contains no other "Apple").
    #[test]
    fn test_dictionary_link_with_type() {
        let dict = WikidataDictionary::with_common_entities();

        let linked = dict.link("Apple", Some(WikidataNERType::Organization));
        assert!(linked.is_some());
        assert_eq!(linked.map(|entity| entity.qid.as_str()), Some("Q312"));
    }

    /// Direct Q-ID access returns the stored entity.
    #[test]
    fn test_qid_lookup() {
        let dict = WikidataDictionary::with_common_entities();

        let found = dict.get("Q60");
        assert!(found.is_some());
        assert_eq!(found.map(|entity| entity.label.as_str()), Some("New York City"));
    }

    /// The IRI is the entity prefix followed by the Q-ID.
    #[test]
    fn test_entity_iri() {
        let iri = WikidataEntity::new("Q937", "Albert Einstein").iri();
        assert_eq!(iri, "http://www.wikidata.org/entity/Q937");
    }

    /// Short tags for the four most common categories.
    #[test]
    fn test_ner_type_to_str() {
        let expectations = [
            (WikidataNERType::Person, "PER"),
            (WikidataNERType::Organization, "ORG"),
            (WikidataNERType::Location, "LOC"),
            (WikidataNERType::GeopoliticalEntity, "GPE"),
        ];

        for &(ner_type, tag) in expectations.iter() {
            assert_eq!(ner_type.to_entity_type_str(), tag);
        }
    }
}