1use crate::edit_distance;
22use serde::{Deserialize, Serialize};
23use std::collections::HashMap;
24
/// Knowledge base that a [`Candidate`] identifier belongs to.
///
/// The variant selects the IRI template used by `Candidate::to_iri` and the prefix used by
/// `Candidate::to_curie`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum CandidateSource {
    /// Wikidata (`http://www.wikidata.org/entity/…`, CURIE prefix `wd`). This is the
    /// `Default` variant.
    #[default]
    Wikidata,
    /// YAGO (`http://yago-knowledge.org/resource/…`, CURIE prefix `yago`).
    YAGO,
    /// DBpedia (`http://dbpedia.org/resource/…`, CURIE prefix `dbr`).
    DBpedia,
    /// English Wikipedia (`https://en.wikipedia.org/wiki/…`, CURIE prefix `wp`).
    Wikipedia,
    /// Freebase (`http://rdf.freebase.com/ns/…`, CURIE prefix `fb`).
    Freebase,
    /// UMLS (`https://uts.nlm.nih.gov/uts/umls/concept/…`, CURIE prefix `umls`).
    UMLS,
    /// GeoNames (`https://sws.geonames.org/…/`, CURIE prefix `gn`).
    GeoNames,
    /// Any other source. The contained string is used verbatim as the prefix in both
    /// `Candidate::to_iri` and `Candidate::to_curie` (`"<name>:<kb_id>"`).
    Custom(String),
}
46
/// String-similarity measure used to compare a mention with a candidate label.
///
/// Scores are produced by `SimilarityMetric::compute`; the metric can be looked up by name
/// with `SimilarityMetric::parse_str`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum SimilarityMetric {
    /// Jaccard-style overlap, computed by `string_similarity`. This is the `Default`
    /// variant.
    #[default]
    Jaccard,

    /// Edit-distance similarity, computed by `edit_distance::edit_similarity`.
    EditDistance,

    /// Similarity derived from `edit_distance::damerau_levenshtein`, normalised by the
    /// longer input's length in `char`s.
    DamerauLevenshtein,

    /// Edit-distance similarity with wildcard support, computed by
    /// `edit_distance::edit_similarity_wildcards`. The tests in this file exercise `?` and
    /// `*` as wildcard characters.
    EditDistanceWildcard,
}
97
98impl SimilarityMetric {
99 #[must_use]
103 pub fn compute(&self, a: &str, b: &str) -> f64 {
104 match self {
105 SimilarityMetric::Jaccard => string_similarity(a, b),
106 SimilarityMetric::EditDistance => edit_distance::edit_similarity(a, b),
107 SimilarityMetric::DamerauLevenshtein => {
108 let dist = edit_distance::damerau_levenshtein(a, b);
110 let max_len = a.chars().count().max(b.chars().count());
111 if max_len == 0 {
112 1.0
113 } else {
114 1.0 - (dist as f64 / max_len as f64)
115 }
116 }
117 SimilarityMetric::EditDistanceWildcard => {
118 edit_distance::edit_similarity_wildcards(a, b)
119 }
120 }
121 }
122
123 #[must_use]
125 pub fn name(&self) -> &'static str {
126 match self {
127 SimilarityMetric::Jaccard => "jaccard",
128 SimilarityMetric::EditDistance => "edit-distance",
129 SimilarityMetric::DamerauLevenshtein => "damerau-levenshtein",
130 SimilarityMetric::EditDistanceWildcard => "edit-distance-wildcard",
131 }
132 }
133
134 pub fn parse_str(s: &str) -> Option<Self> {
138 match s.to_lowercase().as_str() {
139 "jaccard" | "jac" => Some(SimilarityMetric::Jaccard),
140 "edit-distance" | "edit" | "levenshtein" | "lev" => {
141 Some(SimilarityMetric::EditDistance)
142 }
143 "damerau-levenshtein" | "damerau" | "dl" => Some(SimilarityMetric::DamerauLevenshtein),
144 "edit-distance-wildcard" | "wildcard" | "edw" => {
145 Some(SimilarityMetric::EditDistanceWildcard)
146 }
147 _ => None,
148 }
149 }
150}
151
/// A knowledge-base entry proposed as the referent of a mention, together with the
/// features used to rank it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Candidate {
    /// Identifier of the entry inside its knowledge base (e.g. `Q937` in the tests).
    pub kb_id: String,
    /// Knowledge base the identifier belongs to.
    pub source: CandidateSource,
    /// Primary label of the entry.
    pub label: String,
    /// Alternative names; empty for a freshly constructed candidate.
    pub aliases: Vec<String>,
    /// Optional free-text description.
    pub description: Option<String>,
    /// Optional type of the entry as given by the knowledge base.
    pub kb_type: Option<String>,
    /// Optional sitelink count; contributes a log-scaled term to `score`.
    pub sitelinks: Option<u32>,
    /// Prior weight of the candidate (weight 0.3 in `compute_score`).
    pub prior: f64,
    /// Similarity between the mention and this candidate (weight 0.4 in `compute_score`).
    pub string_sim: f64,
    /// Type-compatibility score (weight 0.2 in `compute_score`); starts at `1.0`.
    pub type_score: f64,
    /// Combined ranking score written by `compute_score` and
    /// `compute_score_with_temporal`.
    pub score: f64,
    /// Optional start of the candidate's validity as a date string; only the leading
    /// year is read when computing temporal compatibility.
    pub valid_from: Option<String>,
    /// Optional end of the candidate's validity as a date string; only the leading year
    /// is read when computing temporal compatibility.
    pub valid_until: Option<String>,
}
189
190impl Candidate {
191 pub fn new(kb_id: &str, source: CandidateSource, label: &str) -> Self {
193 Self {
194 kb_id: kb_id.to_string(),
195 source,
196 label: label.to_string(),
197 aliases: Vec::new(),
198 description: None,
199 kb_type: None,
200 sitelinks: None,
201 prior: 0.0,
202 string_sim: 0.0,
203 type_score: 1.0,
204 score: 0.0,
205 valid_from: None,
206 valid_until: None,
207 }
208 }
209
210 pub fn with_valid_from(mut self, date: &str) -> Self {
212 self.valid_from = Some(date.to_string());
213 self
214 }
215
216 pub fn with_valid_until(mut self, date: &str) -> Self {
218 self.valid_until = Some(date.to_string());
219 self
220 }
221
222 pub fn with_alias(mut self, alias: &str) -> Self {
224 self.aliases.push(alias.to_string());
225 self
226 }
227
228 pub fn with_description(mut self, desc: &str) -> Self {
230 self.description = Some(desc.to_string());
231 self
232 }
233
234 pub fn with_kb_type(mut self, kb_type: &str) -> Self {
236 self.kb_type = Some(kb_type.to_string());
237 self
238 }
239
240 pub fn with_prior(mut self, prior: f64) -> Self {
242 self.prior = prior;
243 self
244 }
245
246 pub fn compute_score(&mut self) {
248 self.score = 0.4 * self.string_sim
250 + 0.3 * self.prior
251 + 0.2 * self.type_score
252 + 0.1
253 * self
254 .sitelinks
255 .map(|s| (s as f64).log10() / 7.0)
256 .unwrap_or(0.0);
257 }
258
259 pub fn compute_score_with_temporal(&mut self, document_date: Option<&str>) {
268 self.compute_score();
270
271 if let Some(doc_date) = document_date {
273 let temporal_score = self.temporal_compatibility(doc_date);
274 self.score *= 0.5 + 0.5 * temporal_score;
276 }
277 }
278
279 pub fn temporal_compatibility(&self, document_date: &str) -> f64 {
284 let doc_year = parse_year(document_date);
286
287 let from_year = self.valid_from.as_deref().and_then(parse_year);
289 let until_year = self.valid_until.as_deref().and_then(parse_year);
290
291 match (from_year, until_year, doc_year) {
292 (None, None, _) | (_, _, None) => 1.0,
294
295 (Some(from), _, Some(doc)) if doc < from => {
297 let years_before = from - doc;
299 (1.0 - years_before as f64 / 20.0).max(0.1)
300 }
301
302 (_, Some(until), Some(doc)) if doc > until => {
304 let years_after = doc - until;
306 (1.0 - years_after as f64 / 20.0).max(0.1)
307 }
308
309 _ => 1.0,
311 }
312 }
313
314 pub fn to_iri(&self) -> String {
316 match &self.source {
317 CandidateSource::Wikidata => {
318 format!("http://www.wikidata.org/entity/{}", self.kb_id)
319 }
320 CandidateSource::YAGO => {
321 format!("http://yago-knowledge.org/resource/{}", self.kb_id)
322 }
323 CandidateSource::DBpedia => {
324 format!("http://dbpedia.org/resource/{}", self.kb_id)
325 }
326 CandidateSource::Wikipedia => {
327 format!("https://en.wikipedia.org/wiki/{}", self.kb_id)
328 }
329 CandidateSource::Freebase => {
330 format!("http://rdf.freebase.com/ns/{}", self.kb_id)
331 }
332 CandidateSource::UMLS => {
333 format!("https://uts.nlm.nih.gov/uts/umls/concept/{}", self.kb_id)
334 }
335 CandidateSource::GeoNames => {
336 format!("https://sws.geonames.org/{}/", self.kb_id)
337 }
338 CandidateSource::Custom(name) => {
339 format!("{}:{}", name, self.kb_id)
340 }
341 }
342 }
343
344 pub fn to_curie(&self) -> String {
346 let prefix = match &self.source {
347 CandidateSource::Wikidata => "wd",
348 CandidateSource::YAGO => "yago",
349 CandidateSource::DBpedia => "dbr",
350 CandidateSource::Wikipedia => "wp",
351 CandidateSource::Freebase => "fb",
352 CandidateSource::UMLS => "umls",
353 CandidateSource::GeoNames => "gn",
354 CandidateSource::Custom(name) => name,
355 };
356 format!("{}:{}", prefix, self.kb_id)
357 }
358}
359
/// Source of entity candidates for a textual mention.
///
/// Implementations must be `Send + Sync`.
pub trait CandidateGenerator: Send + Sync {
    /// Returns candidates for `mention`.
    ///
    /// * `mention` - surface text to look up.
    /// * `context` - text surrounding the mention; the dictionary implementation in this
    ///   file ignores it.
    /// * `entity_type` - optional type of the mention; the dictionary implementation in
    ///   this file ignores it.
    /// * `limit` - upper bound on the number of candidates (the dictionary implementation
    ///   in this file never returns more).
    fn generate(
        &self,
        mention: &str,
        context: &str,
        entity_type: Option<&str>,
        limit: usize,
    ) -> Vec<Candidate>;

    /// Returns a static identifier for this generator.
    fn name(&self) -> &'static str;
}
380
/// In-memory [`CandidateGenerator`] backed by a surface-form dictionary.
#[derive(Debug, Clone, Default)]
pub struct DictionaryCandidateGenerator {
    /// Candidates keyed by lowercased surface form (see `add_entry`).
    entries: HashMap<String, Vec<Candidate>>,
    /// Similarity metric used when no exact dictionary hit is available.
    metric: SimilarityMetric,
}
408
409impl DictionaryCandidateGenerator {
410 pub fn new() -> Self {
412 Self::default()
413 }
414
415 pub fn with_metric(mut self, metric: SimilarityMetric) -> Self {
427 self.metric = metric;
428 self
429 }
430
431 #[must_use]
433 pub fn metric(&self) -> SimilarityMetric {
434 self.metric
435 }
436
437 pub fn add_entry(&mut self, surface: &str, candidate: Candidate) {
439 let normalized = surface.to_lowercase();
440 self.entries.entry(normalized).or_default().push(candidate);
441 }
442
443 pub fn with_well_known(mut self) -> Self {
448 let well_known = [
449 ("albert einstein", "Q937", "theoretical physicist"),
451 ("marie curie", "Q7186", "physicist and chemist"),
452 (
453 "tu youyou",
454 "Q546079",
455 "Chinese pharmacologist, Nobel laureate",
456 ),
457 ("屠呦呦", "Q546079", "Chinese pharmacologist"),
458 ("c.v. raman", "Q201010", "Indian physicist, Nobel laureate"),
459 (
460 "abdus salam",
461 "Q108365",
462 "Pakistani physicist, Nobel laureate",
463 ),
464 (
465 "wangari maathai",
466 "Q180728",
467 "Kenyan environmentalist, Nobel laureate",
468 ),
469 ("barack obama", "Q76", "44th President of the United States"),
471 ("angela merkel", "Q567", "Chancellor of Germany"),
472 ("習近平", "Q15031", "General Secretary of CCP"),
473 ("xi jinping", "Q15031", "General Secretary of CCP"),
474 ("narendra modi", "Q1058", "Prime Minister of India"),
475 ("नरेन्द्र मोदी", "Q1058", "Prime Minister of India"),
476 ("محمد بن سلمان", "Q6889872", "Crown Prince of Saudi Arabia"),
477 (
478 "mohammed bin salman",
479 "Q6889872",
480 "Crown Prince of Saudi Arabia",
481 ),
482 ("cyril ramaphosa", "Q312910", "President of South Africa"),
483 ("lula da silva", "Q37181", "President of Brazil"),
484 ("google", "Q95", "American technology company"),
486 ("apple", "Q312", "American technology company"),
487 ("microsoft", "Q2283", "American technology company"),
488 ("alibaba", "Q306717", "Chinese technology company"),
489 ("阿里巴巴", "Q306717", "Chinese technology company"),
490 ("tencent", "Q860580", "Chinese technology company"),
491 ("腾讯", "Q860580", "Chinese technology company"),
492 ("samsung", "Q20718", "South Korean conglomerate"),
493 ("삼성", "Q20718", "South Korean conglomerate"),
494 ("tata", "Q752289", "Indian conglomerate"),
495 ("infosys", "Q723418", "Indian technology company"),
496 ("new york", "Q60", "city in New York State"),
498 ("london", "Q84", "capital of the United Kingdom"),
499 ("paris", "Q90", "capital of France"),
500 ("berlin", "Q64", "capital of Germany"),
501 ("tokyo", "Q1490", "capital of Japan"),
502 ("東京", "Q1490", "capital of Japan"),
503 ("beijing", "Q956", "capital of China"),
504 ("北京", "Q956", "capital of China"),
505 ("mumbai", "Q1156", "financial capital of India"),
506 ("मुंबई", "Q1156", "financial capital of India"),
507 ("cairo", "Q85", "capital of Egypt"),
508 ("القاهرة", "Q85", "capital of Egypt"),
509 ("são paulo", "Q174", "largest city in Brazil"),
510 ("lagos", "Q8673", "largest city in Nigeria"),
511 ("москва", "Q649", "capital of Russia"),
512 ("moscow", "Q649", "capital of Russia"),
513 ("dubai", "Q612", "city in UAE"),
514 ("دبي", "Q612", "city in UAE"),
515 ("singapore", "Q334", "city-state in Southeast Asia"),
516 ("新加坡", "Q334", "city-state in Southeast Asia"),
517 ("united nations", "Q1065", "international organization"),
519 ("european union", "Q458", "political and economic union"),
520 (
521 "world health organization",
522 "Q7817",
523 "UN specialized agency",
524 ),
525 ("who", "Q7817", "World Health Organization"),
526 ("nato", "Q7184", "North Atlantic Treaty Organization"),
527 (
528 "african union",
529 "Q7159",
530 "continental union of African states",
531 ),
532 ("asean", "Q7768", "Association of Southeast Asian Nations"),
533 (
534 "opec",
535 "Q7795",
536 "Organization of the Petroleum Exporting Countries",
537 ),
538 ("confucius", "Q4604", "Chinese philosopher"),
540 ("孔子", "Q4604", "Chinese philosopher"),
541 ("mahatma gandhi", "Q1001", "Indian independence leader"),
542 ("महात्मा गांधी", "Q1001", "Indian independence leader"),
543 (
544 "nelson mandela",
545 "Q8023",
546 "South African anti-apartheid leader",
547 ),
548 ("cleopatra", "Q635", "last Pharaoh of Egypt"),
549 ("genghis khan", "Q720", "founder of Mongol Empire"),
550 ("成吉思汗", "Q720", "founder of Mongol Empire"),
551 ("pelé", "Q12897", "Brazilian footballer"),
553 ("shakira", "Q34424", "Colombian singer"),
554 ("bts", "Q485927", "South Korean boy band"),
555 ("방탄소년단", "Q485927", "South Korean boy band"),
556 ("宮崎駿", "Q55400", "Japanese animator"),
557 ("hayao miyazaki", "Q55400", "Japanese animator"),
558 ];
559
560 for (surface, qid, desc) in well_known {
561 let candidate = Candidate::new(qid, CandidateSource::Wikidata, surface)
562 .with_description(desc)
563 .with_prior(0.5);
564 self.add_entry(surface, candidate);
565 }
566
567 self
568 }
569}
570
571impl CandidateGenerator for DictionaryCandidateGenerator {
572 fn generate(
573 &self,
574 mention: &str,
575 _context: &str,
576 _entity_type: Option<&str>,
577 limit: usize,
578 ) -> Vec<Candidate> {
579 let normalized = mention.to_lowercase();
580
581 if !mention.contains('?') && !mention.contains('*') {
583 if let Some(candidates) = self.entries.get(&normalized) {
584 return candidates.iter().take(limit).cloned().collect();
585 }
586 }
587
588 let mut results: Vec<Candidate> =
590 self.entries.iter().flat_map(|(_, v)| v.clone()).collect();
591
592 for c in &mut results {
594 c.string_sim = self.metric.compute(mention, &c.label);
595 c.compute_score();
596 }
597
598 results.retain(|c| c.string_sim > 0.1);
600
601 results.sort_by(|a, b| {
602 b.score
603 .partial_cmp(&a.score)
604 .unwrap_or(std::cmp::Ordering::Equal)
605 });
606 results.truncate(limit);
607
608 results
609 }
610
611 fn name(&self) -> &'static str {
612 "dictionary"
613 }
614}
615
616pub fn string_similarity(a: &str, b: &str) -> f64 {
618 #[cfg(feature = "gramdex")]
624 {
625 string_similarity_gramdex(a, b)
626 }
627 #[cfg(not(feature = "gramdex"))]
628 {
629 string_similarity_textprep(a, b)
630 }
631}
632
633#[must_use]
635pub fn string_similarity_textprep(a: &str, b: &str) -> f64 {
636 textprep::similarity::weighted_word_char_ngram_jaccard(a, b, 3, 0.6, 0.4)
637}
638
/// String similarity that blends `textprep`'s word-level Jaccard (weight 0.6) with
/// `gramdex`'s trigram Jaccard (weight 0.4).
///
/// Only compiled when the `gramdex` feature is enabled.
#[cfg(feature = "gramdex")]
#[must_use]
pub fn string_similarity_gramdex(a: &str, b: &str) -> f64 {
    let word_overlap = textprep::similarity::word_jaccard(a, b);
    // The trigram score is widened to f64 so both terms share one type.
    let trigram_overlap = gramdex::trigram_jaccard(a, b) as f64;
    0.6 * word_overlap + 0.4 * trigram_overlap
}
652
/// Extracts the year from a date string such as `"1990-01-15"`, `"1990"` or `"-0044"`.
///
/// Surrounding whitespace is ignored. A single leading `-` marks a negative year; the
/// year itself is everything up to the next `-`. Returns `None` when the input is blank or
/// the year part is not a valid `i32`.
fn parse_year(date: &str) -> Option<i32> {
    let text = date.trim();

    let (negative, unsigned) = match text.strip_prefix('-') {
        Some(tail) => (true, tail),
        None => (false, text),
    };

    // Keep only the part before the first field separator; a blank year fails to parse.
    let digits = unsigned
        .split_once('-')
        .map_or(unsigned, |(head, _)| head);
    let magnitude = digits.parse::<i32>().ok()?;

    // `digits` cannot contain '-', so `magnitude` is never negative and negation is safe.
    Some(if negative { -magnitude } else { magnitude })
}
677
/// Scores how well a NER type agrees with a knowledge-base type.
///
/// Comparison is case-insensitive. The result is:
/// * `1.0` when either type is missing or both are equal,
/// * `0.95` for a person-like NER type paired with a human/person KB type,
/// * `0.9` for an organization-like NER type paired with an organization, company or
///   institution KB type,
/// * `0.9` for a location-like NER type (`loc`, `gpe`) paired with a city, country, place
///   or location KB type,
/// * `0.3` otherwise.
pub fn type_compatibility(ner_type: Option<&str>, kb_type: Option<&str>) -> f64 {
    let (ner, kb) = match (ner_type, kb_type) {
        (Some(n), Some(k)) => (n.to_lowercase(), k.to_lowercase()),
        _ => return 1.0,
    };

    if ner == kb {
        return 1.0;
    }

    let kb_mentions_any = |needles: &[&str]| needles.iter().any(|&needle| kb.contains(needle));

    // "organization" and "location" are already covered by the "org" / "loc" substrings.
    let person_like = ner == "per" || ner.contains("person");
    let org_like = ner.contains("org");
    let place_like = ner.contains("loc") || ner.contains("gpe");

    if person_like && kb_mentions_any(&["human", "person"]) {
        0.95
    } else if org_like && kb_mentions_any(&["organization", "company", "institution"]) {
        0.9
    } else if place_like && kb_mentions_any(&["city", "country", "place", "location"]) {
        0.9
    } else {
        0.3
    }
}
722
/// Unit tests for candidate construction, scoring, similarity metrics and the dictionary
/// generator.
#[cfg(test)]
mod tests {
    use super::*;

    /// An exact surface-form hit returns the stored entry; fuzzy lookups respect the limit.
    #[test]
    fn test_dictionary_generator() {
        let generator = DictionaryCandidateGenerator::new().with_well_known();

        let candidates = generator.generate("albert einstein", "", None, 5);
        assert!(!candidates.is_empty());
        assert_eq!(candidates[0].kb_id, "Q937");

        // Which entries a partial mention matches depends on the similarity backend, so
        // only the size bound is checked here.
        let partial = generator.generate("Einstein", "", None, 5);
        assert!(
            partial.len() <= 5,
            "got {} candidates for a limit of 5",
            partial.len()
        );
    }

    #[test]
    fn test_string_similarity() {
        assert!(string_similarity_textprep("Albert Einstein", "Einstein") > 0.3);
        assert!(string_similarity_textprep("Albert Einstein", "Albert Einstein") > 0.99);
        assert!(string_similarity_textprep("New York", "New York City") > 0.5);
    }

    #[cfg(feature = "gramdex")]
    #[test]
    fn test_string_similarity_gramdex_bounds() {
        let sim = string_similarity_gramdex("Albert Einstein", "Einstein");
        assert!((0.0..=1.0).contains(&sim), "similarity {} out of [0,1]", sim);
    }

    #[test]
    fn test_type_compatibility() {
        assert!(type_compatibility(Some("PERSON"), Some("human")) > 0.9);
        assert!(type_compatibility(Some("ORG"), Some("company")) > 0.8);
        assert!(type_compatibility(Some("PERSON"), Some("city")) < 0.5);
    }

    #[test]
    fn test_candidate_iri() {
        let c = Candidate::new("Q937", CandidateSource::Wikidata, "Einstein");
        assert_eq!(c.to_iri(), "http://www.wikidata.org/entity/Q937");
    }

    /// CURIEs use the built-in prefix, or the custom source name verbatim.
    #[test]
    fn test_candidate_curie() {
        let wikidata = Candidate::new("Q937", CandidateSource::Wikidata, "Einstein");
        assert_eq!(wikidata.to_curie(), "wd:Q937");

        let custom = Candidate::new("42", CandidateSource::Custom("acme".to_string()), "Thing");
        assert_eq!(custom.to_curie(), "acme:42");
        assert_eq!(custom.to_iri(), "acme:42");
    }

    #[test]
    fn test_parse_year() {
        assert_eq!(parse_year("1990-01-15"), Some(1990));
        assert_eq!(parse_year("1990"), Some(1990));
        // A leading '-' marks a negative year.
        assert_eq!(parse_year("-0044"), Some(-44));
        assert_eq!(parse_year(""), None);
    }

    #[test]
    fn test_temporal_compatibility() {
        let bush_sr = Candidate::new("Q23505", CandidateSource::Wikidata, "George H. W. Bush")
            .with_valid_from("1924-06-12")
            .with_valid_until("2018-11-30");

        let bush_jr = Candidate::new("Q207", CandidateSource::Wikidata, "George W. Bush")
            .with_valid_from("1946-07-06");

        // Both candidates are valid in 1990.
        assert!(bush_sr.temporal_compatibility("1990-01-01") > 0.9);
        assert!(bush_jr.temporal_compatibility("1990-01-01") > 0.9);

        // Two years after the end of the validity period: penalised, but only slightly.
        let sr_compat_2020 = bush_sr.temporal_compatibility("2020-01-01");
        assert!(sr_compat_2020 < 1.0);
        assert!(sr_compat_2020 > 0.5);

        // No end date means the candidate is still valid.
        assert!(bush_jr.temporal_compatibility("2020-01-01") > 0.9);
    }

    #[test]
    fn test_compute_score_with_temporal() {
        let mut caesar = Candidate::new("Q1048", CandidateSource::Wikidata, "Julius Caesar")
            .with_valid_from("-0100-07-12")
            .with_valid_until("-0044-03-15")
            .with_prior(0.9);
        caesar.string_sim = 0.9;

        caesar.compute_score();
        let base_score = caesar.score;

        caesar.compute_score_with_temporal(Some("-0050-01-01"));
        let ancient_score = caesar.score;

        caesar.compute_score_with_temporal(Some("2024-01-01"));
        let modern_score = caesar.score;

        // A contemporary document must rank the candidate above a modern one, and the
        // temporal factor must never raise the score above the base score.
        assert!(ancient_score > modern_score);
        assert!(ancient_score <= base_score || (ancient_score - base_score).abs() < 0.01);
    }

    #[test]
    fn test_similarity_metric_jaccard() {
        let metric = SimilarityMetric::Jaccard;
        assert!(metric.compute("hello world", "hello world") > 0.99);
        assert!(metric.compute("hello world", "hello") > 0.3);
    }

    #[test]
    fn test_similarity_metric_edit_distance() {
        let metric = SimilarityMetric::EditDistance;
        assert!(metric.compute("Einstein", "Einstein") > 0.99);
        assert!(metric.compute("Einstein", "Einstien") > 0.7);
        assert!(metric.compute("Einstein", "Newton") < 0.5);
    }

    #[test]
    fn test_similarity_metric_damerau() {
        let metric = SimilarityMetric::DamerauLevenshtein;
        assert!(metric.compute("teh", "the") > 0.6);
        assert!(metric.compute("recieve", "receive") > 0.8);
    }

    #[test]
    fn test_similarity_metric_wildcard() {
        let metric = SimilarityMetric::EditDistanceWildcard;

        assert!(metric.compute("R?ma", "Roma") > 0.99);
        assert!(metric.compute("Ein*", "Einstein") > 0.99);
        assert!(metric.compute("*stein", "Einstein") > 0.99);

        assert!(metric.compute("???TOR", "CASTOR") > 0.99);
    }

    /// `parse_str` accepts canonical names and aliases and rejects unknown names.
    #[test]
    fn test_similarity_metric_from_str() {
        assert_eq!(
            SimilarityMetric::parse_str("jaccard"),
            Some(SimilarityMetric::Jaccard)
        );
        assert_eq!(
            SimilarityMetric::parse_str("edit-distance"),
            Some(SimilarityMetric::EditDistance)
        );
        assert_eq!(
            SimilarityMetric::parse_str("lev"),
            Some(SimilarityMetric::EditDistance)
        );
        assert_eq!(
            SimilarityMetric::parse_str("wildcard"),
            Some(SimilarityMetric::EditDistanceWildcard)
        );
        assert_eq!(SimilarityMetric::parse_str("unknown"), None);
    }

    #[test]
    fn test_generator_with_edit_distance() {
        let generator = DictionaryCandidateGenerator::new()
            .with_metric(SimilarityMetric::EditDistance)
            .with_well_known();

        let candidates = generator.generate("Albert Einstein", "", None, 5);
        assert!(!candidates.is_empty());
        assert!(candidates
            .iter()
            .any(|c| c.label.to_lowercase().contains("einstein")));

        // What a misspelled single-word mention matches depends on the metric, so only
        // the size bound is checked here.
        let typo_candidates = generator.generate("Einstien", "", None, 5);
        assert!(
            typo_candidates.len() <= 5,
            "got {} candidates for a limit of 5",
            typo_candidates.len()
        );
    }

    #[test]
    fn test_generator_with_wildcard() {
        let generator = DictionaryCandidateGenerator::new()
            .with_metric(SimilarityMetric::EditDistanceWildcard)
            .with_well_known();

        let candidates = generator.generate("marie c*", "", None, 10);
        assert!(!candidates.is_empty());
        assert!(candidates
            .iter()
            .any(|c| c.label.to_lowercase().contains("curie")));
    }

    /// Edit distance must count characters, not bytes, for multi-byte text.
    #[test]
    fn test_similarity_metric_cjk() {
        let metric = SimilarityMetric::EditDistance;

        // One of two characters differs.
        let sim = metric.compute("北京", "北平");
        assert!(sim > 0.4 && sim < 0.9, "CJK similarity: {}", sim);

        assert!(metric.compute("東京", "東京") > 0.99);
    }
}
938
/// Property-based tests for the similarity metrics, candidate scoring, the dictionary
/// generator and serde round-trips.
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Short ASCII strings (letters, digits, spaces), possibly empty.
    fn arb_short_string() -> impl Strategy<Value = String> {
        prop::string::string_regex("[a-zA-Z0-9 ]{0,30}").unwrap()
    }

    /// Two capitalised words separated by a space, e.g. `"Ab Cd"`.
    fn arb_entity_name() -> impl Strategy<Value = String> {
        prop::string::string_regex("[A-Z][a-z]+ [A-Z][a-z]+")
            .unwrap()
            .prop_filter("non-empty", |s| !s.is_empty())
    }

    /// Any of the four similarity metrics.
    fn arb_metric() -> impl Strategy<Value = SimilarityMetric> {
        prop_oneof![
            Just(SimilarityMetric::Jaccard),
            Just(SimilarityMetric::EditDistance),
            Just(SimilarityMetric::DamerauLevenshtein),
            Just(SimilarityMetric::EditDistanceWildcard),
        ]
    }

    // Similarity metric properties.
    proptest! {
        #[test]
        fn prop_metric_bounds(metric in arb_metric(), a in arb_short_string(), b in arb_short_string()) {
            let sim = metric.compute(&a, &b);
            prop_assert!(
                (0.0..=1.0).contains(&sim),
                "Similarity {} out of [0,1] for {:?}",
                sim,
                metric
            );
        }

        #[test]
        fn prop_metric_identity(metric in arb_metric(), s in arb_short_string()) {
            let sim = metric.compute(&s, &s);
            prop_assert!(
                (sim - 1.0).abs() < 1e-10,
                "Identity similarity should be 1.0, got {} for {:?}", sim, metric
            );
        }

        // The wildcard metric is left out: its first argument is treated as a pattern.
        #[test]
        fn prop_symmetric_metrics_symmetric(a in arb_short_string(), b in arb_short_string()) {
            for metric in [
                SimilarityMetric::Jaccard,
                SimilarityMetric::EditDistance,
                SimilarityMetric::DamerauLevenshtein,
            ] {
                let sim1 = metric.compute(&a, &b);
                let sim2 = metric.compute(&b, &a);
                prop_assert!(
                    (sim1 - sim2).abs() < 1e-10,
                    "{:?} not symmetric: ({},{})={} vs ({},{})={}", metric, a, b, sim1, b, a, sim2
                );
            }
        }

        // Every canonical name must parse back to the metric it came from. A failed parse
        // is a test failure, not a silently skipped case.
        #[test]
        fn prop_metric_name_roundtrip(metric in arb_metric()) {
            let name = metric.name();
            prop_assert_eq!(SimilarityMetric::parse_str(name), Some(metric));
        }

        #[test]
        fn prop_metric_empty_identity(metric in arb_metric()) {
            let sim = metric.compute("", "");
            prop_assert!(
                (sim - 1.0).abs() < 1e-10,
                "Empty string identity should be 1.0, got {} for {:?}", sim, metric
            );
        }
    }

    // Candidate properties.
    proptest! {
        #[test]
        fn prop_candidate_score_bounds(
            kb_id in "[A-Z][0-9]+",
            label in arb_entity_name(),
            string_sim in 0.0f64..1.0,
            prior in 0.0f64..1.0
        ) {
            let mut candidate = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            candidate.string_sim = string_sim;
            candidate.prior = prior;
            candidate.compute_score();

            prop_assert!(
                candidate.score >= 0.0 && candidate.score <= 1.0,
                "Score {} out of [0,1]", candidate.score
            );
        }

        #[test]
        fn prop_candidate_kb_id_deterministic(
            kb_id in "[A-Z][0-9]+",
            label in arb_entity_name()
        ) {
            let c1 = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            let c2 = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            prop_assert_eq!(c1.kb_id, c2.kb_id);
        }

        #[test]
        fn prop_candidate_serde_roundtrip(
            kb_id in "[A-Z][0-9]+",
            label in arb_entity_name()
        ) {
            let candidate = Candidate::new(&kb_id, CandidateSource::Wikidata, &label);
            let json = serde_json::to_string(&candidate).unwrap();
            let recovered: Candidate = serde_json::from_str(&json).unwrap();

            prop_assert_eq!(candidate.kb_id, recovered.kb_id);
            prop_assert_eq!(candidate.label, recovered.label);
        }
    }

    // Dictionary generator properties.
    proptest! {
        #[test]
        fn prop_generator_respects_limit(
            mention in arb_entity_name(),
            limit in 1usize..20
        ) {
            let generator = DictionaryCandidateGenerator::new().with_well_known();
            let candidates = generator.generate(&mention, "", None, limit);
            prop_assert!(
                candidates.len() <= limit,
                "Got {} candidates but limit was {}", candidates.len(), limit
            );
        }

        #[test]
        fn prop_generator_name_consistent(metric in arb_metric()) {
            let generator = DictionaryCandidateGenerator::new().with_metric(metric);
            let name = generator.name();
            prop_assert!(!name.is_empty());
        }

        #[test]
        fn prop_generator_metric_set(metric in arb_metric()) {
            let generator = DictionaryCandidateGenerator::new().with_metric(metric);
            prop_assert_eq!(generator.metric(), metric);
        }

        #[test]
        fn prop_candidates_sorted_descending(mention in arb_entity_name()) {
            let generator = DictionaryCandidateGenerator::new().with_well_known();
            let candidates = generator.generate(&mention, "", None, 10);

            for i in 1..candidates.len() {
                prop_assert!(
                    candidates[i-1].score >= candidates[i].score,
                    "Candidates not sorted: {} < {} at positions {}-{}",
                    candidates[i-1].score, candidates[i].score, i-1, i
                );
            }
        }
    }

    // Serde round-trips for `CandidateSource`.
    proptest! {
        #[test]
        fn prop_source_serde_roundtrip(source in prop_oneof![
            Just(CandidateSource::Wikidata),
            Just(CandidateSource::YAGO),
            Just(CandidateSource::DBpedia),
            Just(CandidateSource::Wikipedia),
            Just(CandidateSource::Freebase),
            Just(CandidateSource::UMLS),
            Just(CandidateSource::GeoNames),
        ]) {
            let json = serde_json::to_string(&source).unwrap();
            let recovered: CandidateSource = serde_json::from_str(&json).unwrap();
            prop_assert_eq!(source, recovered);
        }

        #[test]
        fn prop_custom_source_roundtrip(name in "[a-z]+") {
            let source = CandidateSource::Custom(name);
            let json = serde_json::to_string(&source).unwrap();
            let recovered: CandidateSource = serde_json::from_str(&json).unwrap();

            // Comparing whole values checks both the variant and the contained name.
            prop_assert_eq!(source, recovered);
        }
    }
}