1use serde::{Deserialize, Serialize};
61use std::collections::HashMap;
62
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
69pub enum KnowledgeBase {
70 Wikidata,
72 YAGO,
74 DBpedia,
76 Wikipedia,
78 Freebase,
80 UMLS,
82 GeoNames,
84 SchemaOrg,
86 OpenCyc,
88 Custom,
90}
91
92impl KnowledgeBase {
93 #[must_use]
95 pub fn base_uri(&self) -> &'static str {
96 match self {
97 Self::Wikidata => "http://www.wikidata.org/entity/",
98 Self::YAGO => "http://yago-knowledge.org/resource/",
99 Self::DBpedia => "http://dbpedia.org/resource/",
100 Self::Wikipedia => "https://en.wikipedia.org/wiki/",
101 Self::Freebase => "http://rdf.freebase.com/ns/",
102 Self::UMLS => "https://uts.nlm.nih.gov/uts/umls/concept/",
103 Self::GeoNames => "https://sws.geonames.org/",
104 Self::SchemaOrg => "https://schema.org/",
105 Self::OpenCyc => "http://sw.opencyc.org/concept/",
106 Self::Custom => "",
107 }
108 }
109
110 #[must_use]
112 pub fn sparql_endpoint(&self) -> Option<&'static str> {
113 match self {
114 Self::Wikidata => Some("https://query.wikidata.org/sparql"),
115 Self::DBpedia => Some("https://dbpedia.org/sparql"),
116 Self::YAGO => Some("https://yago-knowledge.org/sparql/query"),
117 _ => None,
118 }
119 }
120
121 #[must_use]
123 pub fn search_api(&self) -> Option<&'static str> {
124 match self {
125 Self::Wikidata => Some("https://www.wikidata.org/w/api.php"),
126 Self::Wikipedia => Some("https://en.wikipedia.org/w/api.php"),
127 Self::GeoNames => Some("http://api.geonames.org/searchJSON"),
128 _ => None,
129 }
130 }
131
132 #[must_use]
134 pub fn is_active(&self) -> bool {
135 !matches!(self, Self::Freebase | Self::OpenCyc)
136 }
137
138 #[must_use]
140 pub fn same_as_predicate(&self) -> &'static str {
141 "http://www.w3.org/2002/07/owl#sameAs"
142 }
143}
144
145impl std::fmt::Display for KnowledgeBase {
146 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147 match self {
148 Self::Wikidata => write!(f, "Wikidata"),
149 Self::YAGO => write!(f, "YAGO"),
150 Self::DBpedia => write!(f, "DBpedia"),
151 Self::Wikipedia => write!(f, "Wikipedia"),
152 Self::Freebase => write!(f, "Freebase"),
153 Self::UMLS => write!(f, "UMLS"),
154 Self::GeoNames => write!(f, "GeoNames"),
155 Self::SchemaOrg => write!(f, "Schema.org"),
156 Self::OpenCyc => write!(f, "OpenCyc"),
157 Self::Custom => write!(f, "Custom"),
158 }
159 }
160}
161
162#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
168pub struct EntityURI {
169 pub kb: KnowledgeBase,
171 pub local_id: String,
173 pub uri: String,
175 pub label: Option<String>,
177}
178
179impl EntityURI {
180 pub fn new(kb: KnowledgeBase, local_id: &str) -> Self {
182 let uri = format!("{}{}", kb.base_uri(), local_id);
183 Self {
184 kb,
185 local_id: local_id.to_string(),
186 uri,
187 label: None,
188 }
189 }
190
191 pub fn with_label(mut self, label: &str) -> Self {
193 self.label = Some(label.to_string());
194 self
195 }
196
197 pub fn parse(uri: &str) -> Option<Self> {
199 for kb in &[
201 KnowledgeBase::Wikidata,
202 KnowledgeBase::YAGO,
203 KnowledgeBase::DBpedia,
204 KnowledgeBase::Wikipedia,
205 KnowledgeBase::Freebase,
206 KnowledgeBase::UMLS,
207 KnowledgeBase::GeoNames,
208 KnowledgeBase::SchemaOrg,
209 ] {
210 if uri.starts_with(kb.base_uri()) {
211 let local_id = &uri[kb.base_uri().len()..];
212 return Some(Self {
213 kb: *kb,
214 local_id: local_id.to_string(),
215 uri: uri.to_string(),
216 label: None,
217 });
218 }
219 }
220 None
221 }
222
223 #[must_use]
225 pub fn is_wikidata(&self) -> bool {
226 self.kb == KnowledgeBase::Wikidata && self.local_id.starts_with('Q')
227 }
228
229 #[must_use]
231 pub fn to_curie(&self) -> String {
232 let prefix = match self.kb {
233 KnowledgeBase::Wikidata => "wd",
234 KnowledgeBase::YAGO => "yago",
235 KnowledgeBase::DBpedia => "dbr",
236 KnowledgeBase::Wikipedia => "wp",
237 KnowledgeBase::Freebase => "fb",
238 KnowledgeBase::UMLS => "umls",
239 KnowledgeBase::GeoNames => "gn",
240 KnowledgeBase::SchemaOrg => "schema",
241 KnowledgeBase::OpenCyc => "cyc",
242 KnowledgeBase::Custom => "custom",
243 };
244 format!("{}:{}", prefix, self.local_id)
245 }
246}
247
248#[derive(Debug, Clone, Default)]
256pub struct CrossKBMapper {
257 wikidata_mappings: HashMap<String, Vec<EntityURI>>,
259 reverse_mappings: HashMap<String, String>, }
262
263impl CrossKBMapper {
264 pub fn new() -> Self {
266 Self::default()
267 }
268
269 pub fn add_mapping(&mut self, wikidata_qid: &str, other_uri: EntityURI) {
271 self.reverse_mappings
272 .insert(other_uri.uri.clone(), wikidata_qid.to_string());
273 self.wikidata_mappings
274 .entry(wikidata_qid.to_string())
275 .or_default()
276 .push(other_uri);
277 }
278
279 pub fn get_uris(&self, wikidata_qid: &str) -> Vec<&EntityURI> {
281 self.wikidata_mappings
282 .get(wikidata_qid)
283 .map(|v| v.iter().collect())
284 .unwrap_or_default()
285 }
286
287 pub fn to_wikidata(&self, uri: &str) -> Option<&str> {
289 self.reverse_mappings.get(uri).map(|s| s.as_str())
290 }
291
292 #[must_use]
294 pub fn with_common_mappings() -> Self {
295 let mut mapper = Self::new();
296
297 mapper.add_mapping(
299 "Q937",
300 EntityURI::new(KnowledgeBase::DBpedia, "Albert_Einstein"),
301 );
302 mapper.add_mapping(
303 "Q937",
304 EntityURI::new(KnowledgeBase::YAGO, "Albert_Einstein"),
305 );
306 mapper.add_mapping("Q937", EntityURI::new(KnowledgeBase::Freebase, "m.0jcx"));
307 mapper.add_mapping(
308 "Q937",
309 EntityURI::new(KnowledgeBase::Wikipedia, "Albert_Einstein"),
310 );
311
312 mapper.add_mapping("Q312", EntityURI::new(KnowledgeBase::DBpedia, "Apple_Inc."));
314 mapper.add_mapping("Q312", EntityURI::new(KnowledgeBase::YAGO, "Apple_Inc."));
315 mapper.add_mapping("Q312", EntityURI::new(KnowledgeBase::Freebase, "m.0k8z"));
316
317 mapper.add_mapping(
319 "Q60",
320 EntityURI::new(KnowledgeBase::DBpedia, "New_York_City"),
321 );
322 mapper.add_mapping("Q60", EntityURI::new(KnowledgeBase::YAGO, "New_York_City"));
323 mapper.add_mapping("Q60", EntityURI::new(KnowledgeBase::GeoNames, "5128581"));
324
325 mapper.add_mapping(
327 "Q30",
328 EntityURI::new(KnowledgeBase::DBpedia, "United_States"),
329 );
330 mapper.add_mapping("Q30", EntityURI::new(KnowledgeBase::YAGO, "United_States"));
331 mapper.add_mapping("Q30", EntityURI::new(KnowledgeBase::GeoNames, "6252001"));
332
333 mapper
334 }
335}
336
337#[derive(Debug, Clone, Serialize, Deserialize)]
343pub struct YAGOEntity {
344 pub yago_id: String,
346 pub label: String,
348 pub types: Vec<String>,
350 pub wordnet_synset: Option<String>,
352 pub geonames_id: Option<String>,
354 pub wikidata_qid: Option<String>,
356 pub wikipedia_article: Option<String>,
358}
359
360impl YAGOEntity {
361 pub fn new(yago_id: &str, label: &str) -> Self {
363 Self {
364 yago_id: yago_id.to_string(),
365 label: label.to_string(),
366 types: Vec::new(),
367 wordnet_synset: None,
368 geonames_id: None,
369 wikidata_qid: None,
370 wikipedia_article: None,
371 }
372 }
373
374 #[must_use]
376 pub fn uri(&self) -> String {
377 format!("{}{}", KnowledgeBase::YAGO.base_uri(), self.yago_id)
378 }
379
380 #[must_use]
382 pub fn is_person(&self) -> bool {
383 self.types
384 .iter()
385 .any(|t| t.contains("person") || t.contains("human") || t.contains("wordnet_person"))
386 }
387
388 #[must_use]
390 pub fn is_location(&self) -> bool {
391 self.types.iter().any(|t| {
392 t.contains("location")
393 || t.contains("place")
394 || t.contains("city")
395 || t.contains("country")
396 }) || self.geonames_id.is_some()
397 }
398
399 #[must_use]
401 pub fn is_organization(&self) -> bool {
402 self.types.iter().any(|t| {
403 t.contains("organization") || t.contains("company") || t.contains("institution")
404 })
405 }
406}
407
408#[derive(Debug, Clone)]
414pub struct NeuralLinkingConfig {
415 pub architecture: NeuralArchitecture,
417 pub max_candidates: usize,
419 pub use_reranker: bool,
421 pub confidence_threshold: f64,
423 pub zero_shot: bool,
425}
426
427impl Default for NeuralLinkingConfig {
428 fn default() -> Self {
429 Self {
430 architecture: NeuralArchitecture::BiEncoder,
431 max_candidates: 64,
432 use_reranker: true,
433 confidence_threshold: 0.5,
434 zero_shot: false,
435 }
436 }
437}
438
439#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
441pub enum NeuralArchitecture {
442 BiEncoder,
444 CrossEncoder,
446 ReFinED,
448 GENRE,
450 MGENRE,
452 EntQA,
454}
455
456impl NeuralArchitecture {
457 #[must_use]
459 pub fn model_id(&self) -> &'static str {
460 match self {
461 Self::BiEncoder => "facebook/blink-biencoder",
462 Self::CrossEncoder => "facebook/blink-crossencoder",
463 Self::ReFinED => "amazon/refined-wikipedia",
464 Self::GENRE => "facebook/genre-kilt",
465 Self::MGENRE => "facebook/mgenre-wiki",
466 Self::EntQA => "EntQA/entqa",
467 }
468 }
469
470 #[must_use]
472 pub fn supports_zero_shot(&self) -> bool {
473 matches!(self, Self::ReFinED | Self::GENRE | Self::MGENRE)
474 }
475}
476
477#[derive(Debug, Clone, Default)]
483pub struct UnifiedLinker {
484 enabled_kbs: Vec<KnowledgeBase>,
486 mapper: CrossKBMapper,
488 dictionary: HashMap<String, Vec<EntityURI>>, #[allow(dead_code)] neural_config: Option<NeuralLinkingConfig>,
493}
494
495impl UnifiedLinker {
496 pub fn builder() -> UnifiedLinkerBuilder {
498 UnifiedLinkerBuilder::default()
499 }
500
501 pub fn link_to_uris(&self, mention: &str, _entity_type: Option<&str>) -> Vec<EntityURI> {
503 let mention_lower = mention.to_lowercase();
504
505 if let Some(uris) = self.dictionary.get(&mention_lower) {
507 return uris
508 .iter()
509 .filter(|u| self.enabled_kbs.contains(&u.kb))
510 .cloned()
511 .collect();
512 }
513
514 Vec::new()
517 }
518
519 pub fn link_primary(&self, mention: &str, entity_type: Option<&str>) -> Option<EntityURI> {
521 self.link_to_uris(mention, entity_type)
522 .into_iter()
523 .find(|u| u.kb == KnowledgeBase::Wikidata)
524 }
525
526 pub fn expand_wikidata(&self, qid: &str) -> Vec<EntityURI> {
528 let mut uris = vec![EntityURI::new(KnowledgeBase::Wikidata, qid)];
529 uris.extend(self.mapper.get_uris(qid).iter().cloned().cloned());
530 uris
531 }
532}
533
534#[derive(Debug, Clone, Default)]
536pub struct UnifiedLinkerBuilder {
537 enabled_kbs: Vec<KnowledgeBase>,
538 use_common_mappings: bool,
539 neural_config: Option<NeuralLinkingConfig>,
540}
541
542impl UnifiedLinkerBuilder {
543 pub fn add_kb(mut self, kb: KnowledgeBase) -> Self {
545 if !self.enabled_kbs.contains(&kb) {
546 self.enabled_kbs.push(kb);
547 }
548 self
549 }
550
551 pub fn with_common_mappings(mut self) -> Self {
553 self.use_common_mappings = true;
554 self
555 }
556
557 pub fn with_neural_config(mut self, config: NeuralLinkingConfig) -> Self {
559 self.neural_config = Some(config);
560 self
561 }
562
563 pub fn build(self) -> UnifiedLinker {
565 let mapper = if self.use_common_mappings {
566 CrossKBMapper::with_common_mappings()
567 } else {
568 CrossKBMapper::new()
569 };
570
571 let enabled_kbs = if self.enabled_kbs.is_empty() {
572 vec![KnowledgeBase::Wikidata] } else {
574 self.enabled_kbs
575 };
576
577 UnifiedLinker {
578 enabled_kbs,
579 mapper,
580 dictionary: HashMap::new(),
581 neural_config: self.neural_config,
582 }
583 }
584}
585
586#[derive(Debug, Clone, Serialize, Deserialize)]
595pub struct NILCluster {
596 pub id: u64,
598 pub canonical: String,
600 pub surfaces: Vec<String>,
602 pub entity_type: Option<String>,
604 pub confidence: f64,
606 pub mention_count: usize,
608}
609
610impl NILCluster {
611 pub fn new(id: u64, canonical: &str) -> Self {
613 Self {
614 id,
615 canonical: canonical.to_string(),
616 surfaces: vec![canonical.to_string()],
617 entity_type: None,
618 confidence: 1.0,
619 mention_count: 1,
620 }
621 }
622
623 #[must_use]
625 pub fn temp_uri(&self) -> String {
626 format!("urn:nil:cluster:{}", self.id)
627 }
628}
629
#[cfg(test)]
mod tests {
    use super::*;

    /// Namespaces of Wikidata, YAGO and DBpedia.
    #[test]
    fn test_kb_uris() {
        let expected = [
            (KnowledgeBase::Wikidata, "http://www.wikidata.org/entity/"),
            (KnowledgeBase::YAGO, "http://yago-knowledge.org/resource/"),
            (KnowledgeBase::DBpedia, "http://dbpedia.org/resource/"),
        ];
        for (kb, namespace) in expected.iter() {
            assert_eq!(kb.base_uri(), *namespace);
        }
    }

    /// A Wikidata entity exposes its full URI, its CURIE and the Wikidata predicate.
    #[test]
    fn test_entity_uri() {
        let einstein = EntityURI::new(KnowledgeBase::Wikidata, "Q937");
        assert_eq!(einstein.uri, "http://www.wikidata.org/entity/Q937");
        assert_eq!(einstein.to_curie(), "wd:Q937");
        assert!(einstein.is_wikidata());
    }

    /// Full URIs are split back into knowledge base and local id.
    #[test]
    fn test_uri_parsing() {
        let wikidata = EntityURI::parse("http://www.wikidata.org/entity/Q937")
            .expect("Wikidata URI should parse");
        assert_eq!(wikidata.kb, KnowledgeBase::Wikidata);
        assert_eq!(wikidata.local_id, "Q937");

        let dbpedia = EntityURI::parse("http://dbpedia.org/resource/Albert_Einstein")
            .expect("DBpedia URI should parse");
        assert_eq!(dbpedia.kb, KnowledgeBase::DBpedia);
    }

    /// The built-in mappings contain a DBpedia equivalent for Q937.
    #[test]
    fn test_cross_kb_mapper() {
        let mapper = CrossKBMapper::with_common_mappings();

        let equivalents = mapper.get_uris("Q937");
        assert!(!equivalents.is_empty());

        let has_dbpedia = equivalents
            .iter()
            .any(|entity| entity.kb == KnowledgeBase::DBpedia);
        assert!(has_dbpedia);
    }

    /// A WordNet person type makes the entity a person but not a location.
    #[test]
    fn test_yago_entity() {
        let mut einstein = YAGOEntity::new("Albert_Einstein", "Albert Einstein");
        einstein.types.push(String::from("wordnet_person_100007846"));
        einstein.wikidata_qid = Some(String::from("Q937"));

        assert!(einstein.is_person());
        assert!(!einstein.is_location());
        assert_eq!(
            einstein.uri(),
            "http://yago-knowledge.org/resource/Albert_Einstein"
        );
    }

    /// Expanding a QID always yields at least the Wikidata URI itself.
    #[test]
    fn test_unified_linker() {
        let builder = UnifiedLinker::builder()
            .add_kb(KnowledgeBase::Wikidata)
            .add_kb(KnowledgeBase::DBpedia)
            .with_common_mappings();
        let linker = builder.build();

        let expanded = linker.expand_wikidata("Q937");
        assert!(!expanded.is_empty());
        assert!(expanded
            .iter()
            .any(|entity| entity.kb == KnowledgeBase::Wikidata));
    }

    /// GENRE is flagged as zero-shot capable, the bi-encoder is not.
    #[test]
    fn test_neural_architecture() {
        assert!(NeuralArchitecture::GENRE.supports_zero_shot());
        assert!(!NeuralArchitecture::BiEncoder.supports_zero_shot());
    }

    /// Temporary cluster URIs live in the `urn:nil:` namespace.
    #[test]
    fn test_nil_cluster() {
        let cluster = NILCluster::new(1, "John Doe");
        let uri = cluster.temp_uri();
        assert!(uri.starts_with("urn:nil:"));
    }
}