1use serde::{Deserialize, Serialize};
36use std::collections::HashMap;
37
38#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
40pub enum ConfusableReason {
41 DisambiguationPage,
43 NameCollision,
45 TransliterationVariant,
47 OcrError,
49 TemporalAmbiguity,
51 FamilialAmbiguity,
53 NameChange,
55 FictionalVsReal,
57}
58
59#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct ConfusableEntity {
62 pub kb_id: String,
64 pub label: String,
66 pub description: Option<String>,
68 pub reason: ConfusableReason,
70 pub entity_type: Option<String>,
72 pub difficulty: f64,
74}
75
76impl ConfusableEntity {
77 pub fn new(kb_id: &str, reason: ConfusableReason) -> Self {
79 Self {
80 kb_id: kb_id.to_string(),
81 label: String::new(),
82 description: None,
83 reason,
84 entity_type: None,
85 difficulty: 0.5,
86 }
87 }
88
89 pub fn with_label(mut self, label: &str) -> Self {
91 self.label = label.to_string();
92 self
93 }
94
95 pub fn with_description(mut self, desc: &str) -> Self {
97 self.description = Some(desc.to_string());
98 self
99 }
100
101 pub fn with_entity_type(mut self, entity_type: &str) -> Self {
103 self.entity_type = Some(entity_type.to_string());
104 self
105 }
106
107 pub fn with_difficulty(mut self, difficulty: f64) -> Self {
109 self.difficulty = difficulty.clamp(0.0, 1.0);
110 self
111 }
112}
113
114#[derive(Debug, Clone, Default, Serialize, Deserialize)]
116pub struct ConfusableSet {
117 pub surface_form: String,
119 pub entities: Vec<ConfusableEntity>,
121}
122
123impl ConfusableSet {
124 pub fn new(surface_form: &str) -> Self {
126 Self {
127 surface_form: surface_form.to_string(),
128 entities: Vec::new(),
129 }
130 }
131
132 pub fn add(&mut self, kb_id: &str, reason: ConfusableReason, description: Option<&str>) {
134 let mut entity = ConfusableEntity::new(kb_id, reason);
135 if let Some(desc) = description {
136 entity = entity.with_description(desc);
137 }
138 self.entities.push(entity);
139 }
140
141 pub fn add_entity(&mut self, entity: ConfusableEntity) {
143 self.entities.push(entity);
144 }
145
146 pub fn to_training_pairs(&self, positive_kb_id: &str) -> Vec<TrainingPair> {
151 self.entities
152 .iter()
153 .filter(|e| e.kb_id != positive_kb_id)
154 .map(|negative| TrainingPair {
155 surface_form: self.surface_form.clone(),
156 positive_kb_id: positive_kb_id.to_string(),
157 negative_kb_id: negative.kb_id.clone(),
158 negative_description: negative.description.clone(),
159 difficulty: negative.difficulty,
160 })
161 .collect()
162 }
163
164 pub fn len(&self) -> usize {
166 self.entities.len()
167 }
168
169 pub fn is_empty(&self) -> bool {
171 self.entities.is_empty()
172 }
173
174 pub fn filter_by_reason(&self, reason: &ConfusableReason) -> Vec<&ConfusableEntity> {
176 self.entities
177 .iter()
178 .filter(|e| &e.reason == reason)
179 .collect()
180 }
181}
182
183#[derive(Debug, Clone, Serialize, Deserialize)]
185pub struct TrainingPair {
186 pub surface_form: String,
188 pub positive_kb_id: String,
190 pub negative_kb_id: String,
192 pub negative_description: Option<String>,
194 pub difficulty: f64,
196}
197
198#[derive(Debug, Clone, Default, Serialize, Deserialize)]
202pub struct ConfusableRegistry {
203 entries: HashMap<String, ConfusableSet>,
205}
206
207impl ConfusableRegistry {
208 pub fn new() -> Self {
210 Self::default()
211 }
212
213 pub fn add(&mut self, set: ConfusableSet) {
215 let key = set.surface_form.to_lowercase();
216 self.entries
217 .entry(key)
218 .and_modify(|existing| {
219 for entity in &set.entities {
220 if !existing.entities.iter().any(|e| e.kb_id == entity.kb_id) {
221 existing.entities.push(entity.clone());
222 }
223 }
224 })
225 .or_insert(set);
226 }
227
228 pub fn get(&self, surface_form: &str) -> Option<&ConfusableSet> {
230 self.entries.get(&surface_form.to_lowercase())
231 }
232
233 pub fn len(&self) -> usize {
235 self.entries.len()
236 }
237
238 pub fn is_empty(&self) -> bool {
240 self.entries.is_empty()
241 }
242
243 pub fn with_well_known(mut self) -> Self {
248 let mut john_smith = ConfusableSet::new("John Smith");
250 john_smith.add_entity(
251 ConfusableEntity::new("Q217557", ConfusableReason::DisambiguationPage)
252 .with_label("John Smith")
253 .with_description("English soldier and explorer, founder of Jamestown")
254 .with_entity_type("human")
255 .with_difficulty(0.8),
256 );
257 john_smith.add_entity(
258 ConfusableEntity::new("Q556859", ConfusableReason::DisambiguationPage)
259 .with_label("John Smith")
260 .with_description("British Labour Party leader")
261 .with_entity_type("human")
262 .with_difficulty(0.7),
263 );
264 john_smith.add_entity(
265 ConfusableEntity::new("Q23489016", ConfusableReason::FictionalVsReal)
266 .with_label("John Smith")
267 .with_description("Doctor Who character")
268 .with_entity_type("fictional character")
269 .with_difficulty(0.6),
270 );
271 self.add(john_smith);
272
273 let mut george_bush = ConfusableSet::new("George Bush");
275 george_bush.add_entity(
276 ConfusableEntity::new("Q207", ConfusableReason::TemporalAmbiguity)
277 .with_label("George W. Bush")
278 .with_description("43rd President of the United States")
279 .with_entity_type("human")
280 .with_difficulty(0.9),
281 );
282 george_bush.add_entity(
283 ConfusableEntity::new("Q23505", ConfusableReason::TemporalAmbiguity)
284 .with_label("George H. W. Bush")
285 .with_description("41st President of the United States")
286 .with_entity_type("human")
287 .with_difficulty(0.9),
288 );
289 self.add(george_bush);
290
291 let mut president_bush = ConfusableSet::new("President Bush");
293 president_bush.add_entity(
294 ConfusableEntity::new("Q207", ConfusableReason::TemporalAmbiguity)
295 .with_label("George W. Bush")
296 .with_description("43rd President of the United States")
297 .with_entity_type("human")
298 .with_difficulty(0.95),
299 );
300 president_bush.add_entity(
301 ConfusableEntity::new("Q23505", ConfusableReason::TemporalAmbiguity)
302 .with_label("George H. W. Bush")
303 .with_description("41st President of the United States")
304 .with_entity_type("human")
305 .with_difficulty(0.95),
306 );
307 self.add(president_bush);
308
309 let mut michael_jackson = ConfusableSet::new("Michael Jackson");
311 michael_jackson.add_entity(
312 ConfusableEntity::new("Q2831", ConfusableReason::DisambiguationPage)
313 .with_label("Michael Jackson")
314 .with_description("American singer-songwriter (1958-2009)")
315 .with_entity_type("human")
316 .with_difficulty(0.7),
317 );
318 michael_jackson.add_entity(
319 ConfusableEntity::new("Q318029", ConfusableReason::DisambiguationPage)
320 .with_label("Michael Jackson")
321 .with_description("English beer and whisky writer")
322 .with_entity_type("human")
323 .with_difficulty(0.6),
324 );
325 self.add(michael_jackson);
326
327 let mut apple = ConfusableSet::new("Apple");
329 apple.add_entity(
330 ConfusableEntity::new("Q312", ConfusableReason::NameCollision)
331 .with_label("Apple Inc.")
332 .with_description("American technology company")
333 .with_entity_type("organization")
334 .with_difficulty(0.7),
335 );
336 apple.add_entity(
337 ConfusableEntity::new("Q213710", ConfusableReason::NameCollision)
338 .with_label("Apple Records")
339 .with_description("Record label founded by the Beatles")
340 .with_entity_type("organization")
341 .with_difficulty(0.6),
342 );
343 self.add(apple);
344
345 let mut paris = ConfusableSet::new("Paris");
347 paris.add_entity(
348 ConfusableEntity::new("Q90", ConfusableReason::DisambiguationPage)
349 .with_label("Paris")
350 .with_description("Capital city of France")
351 .with_entity_type("city")
352 .with_difficulty(0.5),
353 );
354 paris.add_entity(
355 ConfusableEntity::new("Q167646", ConfusableReason::DisambiguationPage)
356 .with_label("Paris Hilton")
357 .with_description("American socialite and businesswoman")
358 .with_entity_type("human")
359 .with_difficulty(0.4),
360 );
361 paris.add_entity(
362 ConfusableEntity::new("Q167491", ConfusableReason::FictionalVsReal)
363 .with_label("Paris")
364 .with_description("Trojan prince in Greek mythology")
365 .with_entity_type("mythological figure")
366 .with_difficulty(0.6),
367 );
368 self.add(paris);
369
370 let mut beijing = ConfusableSet::new("Beijing");
372 beijing.add_entity(
373 ConfusableEntity::new("Q956", ConfusableReason::TransliterationVariant)
374 .with_label("Beijing")
375 .with_description("Capital of China (modern transliteration)")
376 .with_entity_type("city")
377 .with_difficulty(0.3),
378 );
379 self.add(beijing);
380
381 let mut peking = ConfusableSet::new("Peking");
382 peking.add_entity(
383 ConfusableEntity::new("Q956", ConfusableReason::TransliterationVariant)
384 .with_label("Beijing")
385 .with_description("Capital of China (historical transliteration)")
386 .with_entity_type("city")
387 .with_difficulty(0.3),
388 );
389 self.add(peking);
390
391 let mut washington = ConfusableSet::new("Washington");
393 washington.add_entity(
394 ConfusableEntity::new("Q23", ConfusableReason::DisambiguationPage)
395 .with_label("George Washington")
396 .with_description("1st President of the United States")
397 .with_entity_type("human")
398 .with_difficulty(0.7),
399 );
400 washington.add_entity(
401 ConfusableEntity::new("Q61", ConfusableReason::DisambiguationPage)
402 .with_label("Washington, D.C.")
403 .with_description("Capital of the United States")
404 .with_entity_type("city")
405 .with_difficulty(0.6),
406 );
407 washington.add_entity(
408 ConfusableEntity::new("Q1223", ConfusableReason::DisambiguationPage)
409 .with_label("Washington")
410 .with_description("State in the Pacific Northwest")
411 .with_entity_type("administrative region")
412 .with_difficulty(0.5),
413 );
414 self.add(washington);
415
416 self
417 }
418
419 pub fn generate_all_training_pairs(&self) -> Vec<TrainingPair> {
424 let mut pairs = Vec::new();
425 for set in self.entries.values() {
426 for positive in &set.entities {
427 pairs.extend(set.to_training_pairs(&positive.kb_id));
428 }
429 }
430 pairs
431 }
432
433 pub fn filter_by_type(&self, entity_type: &str) -> Vec<&ConfusableEntity> {
435 self.entries
436 .values()
437 .flat_map(|set| &set.entities)
438 .filter(|e| {
439 e.entity_type
440 .as_ref()
441 .is_some_and(|t| t.to_lowercase() == entity_type.to_lowercase())
442 })
443 .collect()
444 }
445}
446
/// Table of characters that OCR commonly misreads, used to generate plausible
/// misspellings of a piece of text.
///
/// `OcrConfusables::new` installs the built-in table; the derived `Default`
/// yields an empty table.
#[derive(Debug, Clone, Default)]
pub struct OcrConfusables {
    /// Maps a character to the strings it may be misread as. Values are
    /// strings rather than chars so one character can expand to several
    /// (e.g. `w` -> `vv`).
    substitutions: HashMap<char, Vec<&'static str>>,
}

impl OcrConfusables {
    /// Creates the generator with the built-in substitution table.
    pub fn new() -> Self {
        const TABLE: &[(char, &[&str])] = &[
            ('m', &["r", "n"]),
            ('l', &["1", "I", "|"]),
            ('O', &["0", "Q"]),
            ('I', &["l", "1", "|"]),
            ('S', &["5", "$"]),
            ('B', &["8", "3"]),
            ('G', &["6", "C"]),
            ('Z', &["2"]),
            ('o', &["0", "c"]),
            ('c', &["o", "e"]),
            ('e', &["c", "o"]),
            ('h', &["b", "n"]),
            ('u', &["v", "n"]),
            ('v', &["u", "w"]),
            // Two-character confusions: a `w` read as `vv` or `uu`.
            ('w', &["vv", "uu"]),
        ];

        Self {
            substitutions: TABLE
                .iter()
                .map(|&(original, replacements)| (original, replacements.to_vec()))
                .collect(),
        }
    }

    /// Returns OCR-style misspellings of `text`.
    ///
    /// For every character with an entry in the table, one variant is emitted
    /// per replacement, changing only that single position. After those come
    /// up to two whole-string variants: every `rn` collapsed to `m`, and every
    /// `m` expanded to `rn`. These are produced only when `text` actually
    /// contains the pattern, so the input itself is never returned as a
    /// variant. The match is case-sensitive.
    pub fn generate_variants(&self, text: &str) -> Vec<String> {
        let mut variants = Vec::new();

        for (start, original) in text.char_indices() {
            if let Some(replacements) = self.substitutions.get(&original) {
                // Byte offset just past the character being replaced.
                let end = start + original.len_utf8();
                for replacement in replacements {
                    let mut variant = String::with_capacity(text.len() + replacement.len());
                    variant.push_str(&text[..start]);
                    variant.push_str(replacement);
                    variant.push_str(&text[end..]);
                    variants.push(variant);
                }
            }
        }

        if text.contains("rn") {
            variants.push(text.replace("rn", "m"));
        }
        if text.contains('m') {
            variants.push(text.replace('m', "rn"));
        }

        variants
    }

    /// Returns `true` when one string is a generated variant of the other,
    /// checked in both directions and ignoring ASCII case.
    ///
    /// Only a single substitution step is considered, and two identical
    /// strings are not variants of each other.
    pub fn might_be_variants(&self, a: &str, b: &str) -> bool {
        let derives = |source: &str, target: &str| {
            self.generate_variants(source)
                .iter()
                .any(|variant| variant.eq_ignore_ascii_case(target))
        };
        derives(a, b) || derives(b, a)
    }
}
526
#[cfg(test)]
mod tests {
    use super::*;

    /// A two-entity set yields exactly one negative for a chosen positive.
    #[test]
    fn test_confusable_set() {
        let mut smiths = ConfusableSet::new("John Smith");
        for (kb_id, description) in [("Q1", "Explorer"), ("Q2", "Politician")] {
            smiths.add(kb_id, ConfusableReason::DisambiguationPage, Some(description));
        }
        assert_eq!(smiths.len(), 2);

        let negatives = smiths.to_training_pairs("Q1");
        assert_eq!(negatives.len(), 1);
        assert_eq!(negatives[0].negative_kb_id, "Q2");
    }

    /// The built-in registry is queryable case-insensitively and produces pairs.
    #[test]
    fn test_registry() {
        let registry = ConfusableRegistry::new().with_well_known();

        let bush = registry
            .get("george bush")
            .expect("built-in registry should contain George Bush");
        assert!(bush.len() >= 2);

        assert!(!registry.generate_all_training_pairs().is_empty());
    }

    /// Both "President Bush" candidates are tagged as temporally ambiguous.
    #[test]
    fn test_temporal_ambiguity() {
        let registry = ConfusableRegistry::new().with_well_known();

        let temporal_count = registry
            .get("president bush")
            .unwrap()
            .filter_by_reason(&ConfusableReason::TemporalAmbiguity)
            .len();
        assert!(temporal_count >= 2);
    }

    /// `m` -> `rn` is generated and recognised as a variant.
    #[test]
    fn test_ocr_confusables() {
        let ocr = OcrConfusables::new();

        let produced = ocr.generate_variants("Gnome");
        assert!(produced.iter().any(|variant| variant == "Gnorne"));

        assert!(ocr.might_be_variants("Gnome", "Gnorne"));
    }

    /// "Beijing" and "Peking" are separate entries pointing at the same id.
    #[test]
    fn test_transliteration() {
        let registry = ConfusableRegistry::new().with_well_known();

        let beijing = registry.get("beijing");
        let peking = registry.get("peking");
        assert!(beijing.is_some());
        assert!(peking.is_some());

        let has_q956 =
            |set: &ConfusableSet| set.entities.iter().any(|entity| entity.kb_id == "Q956");
        assert!(has_q956(beijing.unwrap()));
        assert!(has_q956(peking.unwrap()));
    }
}