1use serde::{Deserialize, Serialize};
37use std::collections::HashMap;
38
/// Settings for talking to Wikidata.
///
/// This type only declares the values; the code that consumes them is not
/// part of this section, so the per-field notes below are limited to what the
/// names, types and defaults establish.
#[derive(Debug, Clone)]
pub struct WikidataConfig {
    /// URL of the Wikidata API endpoint.
    pub api_endpoint: String,
    /// Upper bound on the number of candidates.
    /// NOTE(review): presumably per lookup/search request — confirm against the caller.
    pub max_candidates: usize,
    /// Minimum score threshold.
    /// NOTE(review): presumably candidates scoring below this are discarded — confirm.
    pub min_score: f64,
    /// Language codes such as `"en"`.
    pub languages: Vec<String>,
    /// Timeout in seconds (per the field name).
    pub timeout_secs: u64,
    /// Whether caching is turned on.
    pub enable_cache: bool,
    /// Cache time-to-live.
    /// NOTE(review): the unit is not stated here; presumably seconds — confirm.
    pub cache_ttl: u64,
}
61
62impl Default for WikidataConfig {
63 fn default() -> Self {
64 Self {
65 api_endpoint: "https://www.wikidata.org/w/api.php".to_string(),
66 max_candidates: 10,
67 min_score: 0.0,
68 languages: vec!["en".to_string(), "de".to_string(), "fr".to_string()],
69 timeout_secs: 10,
70 enable_cache: true,
71 cache_ttl: 3600, }
73 }
74}
75
/// A single Wikidata item as held by the local dictionary.
///
/// Serializable through serde. Entities are indexed by [`WikidataDictionary`]
/// under their label and aliases (case-insensitively) and under their QID.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WikidataEntity {
    /// Wikidata identifier, e.g. `"Q937"`; appended to the entity namespace by `iri()`.
    pub qid: String,
    /// Primary name; compared case-insensitively by `matches_mention`.
    pub label: String,
    /// Optional short description.
    pub description: Option<String>,
    /// Alternative names; compared case-insensitively by `matches_mention`.
    pub aliases: Vec<String>,
    /// QIDs of the classes this item is an instance of; these are what
    /// `WikidataDictionary::add_entity` feeds to the type mapper.
    pub instance_of: Vec<String>,
    /// QIDs of the classes this item is a subclass of.
    /// Not read by any code in this section.
    pub subclass_of: Vec<String>,
    /// Sitelink count; `WikidataDictionary::link` prefers the candidate with
    /// the highest value.
    pub sitelinks: u32,
    /// NER category, if known. May be (re)assigned by
    /// `WikidataDictionary::add_entity` based on `instance_of`.
    pub entity_type: Option<WikidataNERType>,
    /// Optional Wikipedia article URL.
    pub wikipedia_url: Option<String>,
    /// Optional image URL.
    pub image_url: Option<String>,
}
104
105impl WikidataEntity {
106 pub fn new(qid: &str, label: &str) -> Self {
108 Self {
109 qid: qid.to_string(),
110 label: label.to_string(),
111 description: None,
112 aliases: Vec::new(),
113 instance_of: Vec::new(),
114 subclass_of: Vec::new(),
115 sitelinks: 0,
116 entity_type: None,
117 wikipedia_url: None,
118 image_url: None,
119 }
120 }
121
122 #[must_use]
124 pub fn iri(&self) -> String {
125 format!("http://www.wikidata.org/entity/{}", self.qid)
126 }
127
128 #[must_use]
130 pub fn matches_mention(&self, mention: &str) -> bool {
131 let mention_lower = mention.to_lowercase();
132
133 if self.label.to_lowercase() == mention_lower {
134 return true;
135 }
136
137 self.aliases
138 .iter()
139 .any(|a| a.to_lowercase() == mention_lower)
140 }
141}
142
/// Coarse named-entity categories that Wikidata classes are mapped onto.
///
/// Each variant has a short tag returned by `to_entity_type_str`. The built-in
/// table in `WikidataTypeMapper::new` contains no QIDs for `DateTime` or
/// `Miscellaneous`; those can only be produced through mappings added later.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum WikidataNERType {
    /// Tagged `"PER"`.
    Person,
    /// Tagged `"ORG"`.
    Organization,
    /// Tagged `"LOC"`.
    Location,
    /// Tagged `"GPE"`.
    GeopoliticalEntity,
    /// Tagged `"EVENT"`.
    Event,
    /// Tagged `"PRODUCT"`.
    Product,
    /// Tagged `"WORK_OF_ART"`.
    WorkOfArt,
    /// Tagged `"DATE"`.
    DateTime,
    /// Tagged `"MISC"`.
    Miscellaneous,
}
165
166impl WikidataNERType {
167 #[must_use]
169 pub fn to_entity_type_str(&self) -> &'static str {
170 match self {
171 Self::Person => "PER",
172 Self::Organization => "ORG",
173 Self::Location => "LOC",
174 Self::GeopoliticalEntity => "GPE",
175 Self::Event => "EVENT",
176 Self::Product => "PRODUCT",
177 Self::WorkOfArt => "WORK_OF_ART",
178 Self::DateTime => "DATE",
179 Self::Miscellaneous => "MISC",
180 }
181 }
182}
183
/// Lookup table from Wikidata class QIDs to [`WikidataNERType`] categories.
///
/// NOTE(review): the derived `Default` produces an *empty* table, whereas
/// `WikidataTypeMapper::new()` pre-loads the built-in mappings. Use `new()`
/// unless an empty mapper is really what is wanted.
#[derive(Debug, Clone, Default)]
pub struct WikidataTypeMapper {
    /// Class QID (e.g. `"Q5"`) → NER category.
    mappings: HashMap<String, WikidataNERType>,
}
194
195impl WikidataTypeMapper {
196 #[must_use]
198 pub fn new() -> Self {
199 let mut mappings = HashMap::new();
200
201 mappings.insert("Q5".to_string(), WikidataNERType::Person); mappings.insert("Q215627".to_string(), WikidataNERType::Person); mappings.insert("Q95074".to_string(), WikidataNERType::Person); mappings.insert("Q43229".to_string(), WikidataNERType::Organization); mappings.insert("Q4830453".to_string(), WikidataNERType::Organization); mappings.insert("Q783794".to_string(), WikidataNERType::Organization); mappings.insert("Q891723".to_string(), WikidataNERType::Organization); mappings.insert("Q3918".to_string(), WikidataNERType::Organization); mappings.insert("Q7278".to_string(), WikidataNERType::Organization); mappings.insert("Q476028".to_string(), WikidataNERType::Organization); mappings.insert("Q327333".to_string(), WikidataNERType::Organization); mappings.insert("Q515".to_string(), WikidataNERType::Location); mappings.insert("Q532".to_string(), WikidataNERType::Location); mappings.insert("Q5084".to_string(), WikidataNERType::Location); mappings.insert("Q1549591".to_string(), WikidataNERType::Location); mappings.insert("Q486972".to_string(), WikidataNERType::Location); mappings.insert("Q82794".to_string(), WikidataNERType::Location); mappings.insert("Q46831".to_string(), WikidataNERType::Location); mappings.insert("Q8502".to_string(), WikidataNERType::Location); mappings.insert("Q4022".to_string(), WikidataNERType::Location); mappings.insert("Q23397".to_string(), WikidataNERType::Location); mappings.insert("Q6256".to_string(), WikidataNERType::GeopoliticalEntity); mappings.insert("Q3624078".to_string(), WikidataNERType::GeopoliticalEntity); mappings.insert("Q7275".to_string(), WikidataNERType::GeopoliticalEntity); mappings.insert("Q35657".to_string(), WikidataNERType::GeopoliticalEntity); mappings.insert("Q1656682".to_string(), WikidataNERType::Event); mappings.insert("Q18669875".to_string(), WikidataNERType::Event); mappings.insert("Q198".to_string(), WikidataNERType::Event); mappings.insert("Q11483816".to_string(), WikidataNERType::Event); 
mappings.insert("Q2424752".to_string(), WikidataNERType::Product); mappings.insert("Q35127".to_string(), WikidataNERType::Product); mappings.insert("Q7889".to_string(), WikidataNERType::Product); mappings.insert("Q22811662".to_string(), WikidataNERType::Product); mappings.insert("Q838948".to_string(), WikidataNERType::WorkOfArt); mappings.insert("Q571".to_string(), WikidataNERType::WorkOfArt); mappings.insert("Q11424".to_string(), WikidataNERType::WorkOfArt); mappings.insert("Q7725634".to_string(), WikidataNERType::WorkOfArt); mappings.insert("Q105543609".to_string(), WikidataNERType::WorkOfArt); mappings.insert("Q134556".to_string(), WikidataNERType::WorkOfArt); mappings.insert("Q482994".to_string(), WikidataNERType::WorkOfArt); Self { mappings }
256 }
257
258 #[must_use]
260 pub fn map_type(&self, qid: &str) -> Option<WikidataNERType> {
261 self.mappings.get(qid).copied()
262 }
263
264 #[must_use]
266 pub fn map_types(&self, qids: &[String]) -> Option<WikidataNERType> {
267 let priority = [
269 WikidataNERType::Person,
270 WikidataNERType::GeopoliticalEntity,
271 WikidataNERType::Organization,
272 WikidataNERType::Location,
273 WikidataNERType::Event,
274 WikidataNERType::WorkOfArt,
275 WikidataNERType::Product,
276 ];
277
278 for ptype in &priority {
279 for qid in qids {
280 if let Some(mapped) = self.map_type(qid) {
281 if &mapped == ptype {
282 return Some(mapped);
283 }
284 }
285 }
286 }
287
288 for qid in qids {
290 if let Some(mapped) = self.map_type(qid) {
291 return Some(mapped);
292 }
293 }
294
295 None
296 }
297
298 pub fn add_mapping(&mut self, qid: &str, ner_type: WikidataNERType) {
300 self.mappings.insert(qid.to_string(), ner_type);
301 }
302}
303
/// One hit from a Wikidata search.
///
/// Serializable through serde. Nothing in this section constructs or reads
/// this type, so the field notes are limited to what names and types show.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WikidataSearchResult {
    /// Wikidata identifier of the hit.
    pub qid: String,
    /// Label of the hit.
    pub label: String,
    /// Optional short description.
    pub description: Option<String>,
    /// Score of the hit.
    /// NOTE(review): presumably higher means a better match — confirm with the producer.
    pub score: f64,
    /// Exact-match flag.
    /// NOTE(review): presumably set when the query equals the label — confirm.
    pub exact_match: bool,
}
322
/// In-memory dictionary of [`WikidataEntity`] values, searchable by name
/// (case-insensitively) and by QID.
///
/// NOTE(review): the derived `Default` builds its `type_mapper` through
/// `WikidataTypeMapper::default()`, which is an empty table, whereas
/// `WikidataDictionary::new()` uses the pre-loaded mapper. Prefer `new()`
/// when entity types should be inferred from `instance_of`.
#[derive(Debug, Clone, Default)]
pub struct WikidataDictionary {
    /// Lower-cased label or alias → entities known under that name.
    /// Each entry holds its own clone of the entity.
    by_label: HashMap<String, Vec<WikidataEntity>>,
    /// QID → entity.
    by_qid: HashMap<String, WikidataEntity>,
    /// Mapper used by `add_entity` to derive `entity_type` from `instance_of`.
    type_mapper: WikidataTypeMapper,
}
339
340impl WikidataDictionary {
341 #[must_use]
343 pub fn new() -> Self {
344 Self {
345 by_label: HashMap::new(),
346 by_qid: HashMap::new(),
347 type_mapper: WikidataTypeMapper::new(),
348 }
349 }
350
351 pub fn add_entity(&mut self, mut entity: WikidataEntity) {
353 entity.entity_type = self.type_mapper.map_types(&entity.instance_of);
355
356 let label_key = entity.label.to_lowercase();
358 self.by_label
359 .entry(label_key)
360 .or_default()
361 .push(entity.clone());
362
363 for alias in &entity.aliases {
365 let alias_key = alias.to_lowercase();
366 self.by_label
367 .entry(alias_key)
368 .or_default()
369 .push(entity.clone());
370 }
371
372 self.by_qid.insert(entity.qid.clone(), entity);
374 }
375
376 #[must_use]
378 pub fn lookup(&self, mention: &str) -> Vec<&WikidataEntity> {
379 let key = mention.to_lowercase();
380 self.by_label
381 .get(&key)
382 .map_or(Vec::new(), |v| v.iter().collect())
383 }
384
385 #[must_use]
387 pub fn get(&self, qid: &str) -> Option<&WikidataEntity> {
388 self.by_qid.get(qid)
389 }
390
391 #[must_use]
393 pub fn len(&self) -> usize {
394 self.by_qid.len()
395 }
396
397 #[must_use]
399 pub fn is_empty(&self) -> bool {
400 self.by_qid.is_empty()
401 }
402
403 #[must_use]
405 pub fn link(
406 &self,
407 mention: &str,
408 expected_type: Option<WikidataNERType>,
409 ) -> Option<&WikidataEntity> {
410 let candidates = self.lookup(mention);
411
412 if candidates.is_empty() {
413 return None;
414 }
415
416 if let Some(etype) = expected_type {
418 let filtered: Vec<_> = candidates
419 .iter()
420 .filter(|e| e.entity_type == Some(etype))
421 .copied()
422 .collect();
423
424 if !filtered.is_empty() {
425 return filtered.into_iter().max_by_key(|e| e.sitelinks);
427 }
428 }
429
430 candidates.into_iter().max_by_key(|e| e.sitelinks)
432 }
433
434 #[must_use]
436 pub fn with_common_entities() -> Self {
437 let mut dict = Self::new();
438
439 let entities = vec![
441 WikidataEntity {
442 qid: "Q937".to_string(),
443 label: "Albert Einstein".to_string(),
444 description: Some("German-born theoretical physicist".to_string()),
445 aliases: vec!["Einstein".to_string(), "A. Einstein".to_string()],
446 instance_of: vec!["Q5".to_string()],
447 subclass_of: Vec::new(),
448 sitelinks: 500,
449 entity_type: Some(WikidataNERType::Person),
450 wikipedia_url: Some("https://en.wikipedia.org/wiki/Albert_Einstein".to_string()),
451 image_url: None,
452 },
453 WikidataEntity {
454 qid: "Q312".to_string(),
455 label: "Apple Inc.".to_string(),
456 description: Some("American multinational technology company".to_string()),
457 aliases: vec!["Apple".to_string(), "Apple Computer".to_string()],
458 instance_of: vec!["Q4830453".to_string()],
459 subclass_of: Vec::new(),
460 sitelinks: 400,
461 entity_type: Some(WikidataNERType::Organization),
462 wikipedia_url: Some("https://en.wikipedia.org/wiki/Apple_Inc.".to_string()),
463 image_url: None,
464 },
465 WikidataEntity {
466 qid: "Q60".to_string(),
467 label: "New York City".to_string(),
468 description: Some("Most populous city in the United States".to_string()),
469 aliases: vec![
470 "NYC".to_string(),
471 "New York".to_string(),
472 "The Big Apple".to_string(),
473 ],
474 instance_of: vec!["Q515".to_string()],
475 subclass_of: Vec::new(),
476 sitelinks: 450,
477 entity_type: Some(WikidataNERType::Location),
478 wikipedia_url: Some("https://en.wikipedia.org/wiki/New_York_City".to_string()),
479 image_url: None,
480 },
481 WikidataEntity {
482 qid: "Q30".to_string(),
483 label: "United States of America".to_string(),
484 description: Some("Country primarily located in North America".to_string()),
485 aliases: vec![
486 "USA".to_string(),
487 "United States".to_string(),
488 "US".to_string(),
489 "America".to_string(),
490 ],
491 instance_of: vec!["Q6256".to_string()],
492 subclass_of: Vec::new(),
493 sitelinks: 550,
494 entity_type: Some(WikidataNERType::GeopoliticalEntity),
495 wikipedia_url: Some("https://en.wikipedia.org/wiki/United_States".to_string()),
496 image_url: None,
497 },
498 ];
499
500 for entity in entities {
501 dict.add_entity(entity);
502 }
503
504 dict
505 }
506}
507
#[cfg(test)]
mod tests {
    use super::*;

    /// Built-in class QIDs resolve to their category; unknown QIDs do not.
    #[test]
    fn test_type_mapping() {
        let mapper = WikidataTypeMapper::new();

        let expectations = [
            ("Q5", Some(WikidataNERType::Person)),
            ("Q43229", Some(WikidataNERType::Organization)),
            ("Q515", Some(WikidataNERType::Location)),
            ("Q6256", Some(WikidataNERType::GeopoliticalEntity)),
            ("Q99999999", None),
        ];

        for (class_qid, expected) in expectations.iter() {
            assert_eq!(mapper.map_type(class_qid), *expected);
        }
    }

    /// Mentions match the label or an alias regardless of case, but not a
    /// mere fragment of a name.
    #[test]
    fn test_entity_matches_mention() {
        let entity = WikidataEntity {
            aliases: vec!["Einstein".to_string()],
            ..WikidataEntity::new("Q937", "Albert Einstein")
        };

        for mention in ["Albert Einstein", "einstein", "Einstein"].iter() {
            assert!(entity.matches_mention(mention));
        }
        assert!(!entity.matches_mention("Albert"));
    }

    /// Lookup works through the label and through aliases, ignoring case.
    #[test]
    fn test_dictionary_lookup() {
        let dict = WikidataDictionary::with_common_entities();

        let by_label = dict.lookup("Albert Einstein");
        assert_eq!(by_label.len(), 1);
        assert_eq!(by_label[0].qid, "Q937");

        for alias in ["Einstein", "EINSTEIN"].iter() {
            assert_eq!(dict.lookup(alias).len(), 1);
        }

        assert!(dict.lookup("Nonexistent Entity").is_empty());
    }

    /// Linking with a type hint resolves the alias to the matching entity.
    #[test]
    fn test_dictionary_link_with_type() {
        let dict = WikidataDictionary::with_common_entities();

        let linked_qid = dict
            .link("Apple", Some(WikidataNERType::Organization))
            .map(|entity| entity.qid.as_str());
        assert_eq!(linked_qid, Some("Q312"));
    }

    /// Entities can be fetched directly by QID.
    #[test]
    fn test_qid_lookup() {
        let dict = WikidataDictionary::with_common_entities();

        let label = dict.get("Q60").map(|entity| entity.label.as_str());
        assert_eq!(label, Some("New York City"));
    }

    /// The IRI is the entity namespace followed by the QID.
    #[test]
    fn test_entity_iri() {
        let iri = WikidataEntity::new("Q937", "Albert Einstein").iri();
        assert_eq!(iri, "http://www.wikidata.org/entity/Q937");
    }

    /// Categories render as their short tags.
    #[test]
    fn test_ner_type_to_str() {
        let expectations = [
            (WikidataNERType::Person, "PER"),
            (WikidataNERType::Organization, "ORG"),
            (WikidataNERType::Location, "LOC"),
            (WikidataNERType::GeopoliticalEntity, "GPE"),
        ];

        for (ner_type, tag) in expectations.iter() {
            assert_eq!(ner_type.to_entity_type_str(), *tag);
        }
    }
}