use crate::regex::Regex;
use crate::{Citation, DuplicateGroup};
use std::collections::HashMap;
use std::sync::LazyLock;
use strsim::jaro;
use strsim::jaro_winkler;

const DOI_TITLE_SIMILARITY_THRESHOLD: f64 = 0.85;
const NO_DOI_TITLE_SIMILARITY_THRESHOLD: f64 = 0.93;

static UNICODE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<U\+([0-9A-Fa-f]+)>").unwrap());

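// Literal replacements applied by `normalize_string` after lowercasing:
// escaped angle brackets are decoded, super-/subscript markup is dropped, and
// common Greek letters (spelled out or literal) are folded to single Latin
// letters so that variant renderings of the same title normalize identically.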
const HTML_REPLACEMENTS: [(&str, &str); 13] = [
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("<sup>", ""),
    ("</sup>", ""),
    ("<sub>", ""),
    ("</sub>", ""),
    ("<inf>", ""),
    ("</inf>", ""),
    ("beta", "b"),
    ("alpha", "a"),
    ("α", "a"),
    ("ß", "b"),
    ("γ", "g"),
];

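/// Options controlling how citations are compared and which record in a
/// duplicate group is kept as the unique citation.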
#[derive(Debug, Default, Clone)]
pub struct DeduplicatorConfig {
    /// Partition citations by publication year so that only same-year
    /// citations are compared against each other.
    pub group_by_year: bool,
    /// Process year groups in parallel; only effective when `group_by_year`
    /// is enabled.
    pub run_in_parallel: bool,
    /// Preferred sources, in priority order, used to pick the unique citation
    /// of a duplicate group.
    pub source_preferences: Vec<String>,
}

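/// Detects duplicate citations using DOI equality, fuzzy title similarity, and
/// journal/ISSN/volume/page agreement, grouping each set of duplicates around
/// a single unique record.
///
/// A minimal usage sketch; marked `ignore` because the crate-root re-exports
/// (`Citation`, `DuplicateGroup`) are assumed from the `use` statements above:
///
/// ```ignore
/// let deduplicator = Deduplicator::new().with_config(DeduplicatorConfig {
///     group_by_year: true,
///     ..Default::default()
/// });
/// let groups = deduplicator.find_duplicates(&citations)?;
/// ```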
#[derive(Debug, Default, Clone)]
pub struct Deduplicator {
    config: DeduplicatorConfig,
}

/// A citation together with its pre-computed normalized fields, borrowed for
/// the duration of one comparison pass.
#[derive(Debug)]
struct PreprocessedCitation<'a> {
    original: &'a Citation,
    normalized_title: String,
    normalized_journal: Option<String>,
    normalized_journal_abbr: Option<String>,
    normalized_issn: Vec<String>,
    normalized_volume: String,
}

#[derive(Debug, thiserror::Error)]
pub enum DedupeError {
    #[error("Invalid citation data: {0}")]
    InvalidCitation(String),

    #[error("Processing error: {0}")]
    ProcessingError(String),

    #[error("Configuration error: {0}")]
    ConfigError(String),
}

impl Deduplicator {
    /// Creates a deduplicator with the default configuration: group citations
    /// by year, no parallel processing, and no source preferences.
    #[must_use]
    pub fn new() -> Self {
        Self {
            config: DeduplicatorConfig {
                group_by_year: true,
                run_in_parallel: false,
                source_preferences: Vec::new(),
            },
        }
    }

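    /// Replaces the default configuration.
    ///
    /// `run_in_parallel` is forced to `false` when `group_by_year` is
    /// disabled, since parallelism is only applied across year groups.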
    #[must_use]
    pub fn with_config(mut self, mut config: DeduplicatorConfig) -> Self {
        if !config.group_by_year {
            config.run_in_parallel = false;
        }
        self.config = config;
        self
    }

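    /// Finds groups of duplicate citations, treating every citation as coming
    /// from an unknown source (equivalent to `find_duplicates_with_sources`
    /// with an empty source list).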
    pub fn find_duplicates(
        self,
        citations: &[Citation],
    ) -> Result<Vec<DuplicateGroup>, DedupeError> {
        self.find_duplicates_with_sources(citations, &[])
    }

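    /// Finds groups of duplicate citations, with `sources` naming the origin
    /// of each citation by position; citations without a corresponding entry
    /// are treated as having no source. Returns a `DedupeError::ConfigError`
    /// if more sources than citations are supplied.
    ///
    /// A source-aware sketch; marked `ignore` because the surrounding crate
    /// paths are assumed rather than shown here:
    ///
    /// ```ignore
    /// let config = DeduplicatorConfig {
    ///     source_preferences: vec!["PubMed".to_string(), "Embase".to_string()],
    ///     ..Default::default()
    /// };
    /// let groups = Deduplicator::new()
    ///     .with_config(config)
    ///     .find_duplicates_with_sources(&citations, &["Embase", "PubMed"])?;
    /// ```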
    pub fn find_duplicates_with_sources(
        self,
        citations: &[Citation],
        sources: &[&str],
    ) -> Result<Vec<DuplicateGroup>, DedupeError> {
        if citations.is_empty() {
            return Ok(Vec::new());
        }

        if sources.len() > citations.len() {
            return Err(DedupeError::ConfigError(format!(
                "Number of sources ({}) exceeds number of citations ({}). Each source must correspond to a citation.",
                sources.len(),
                citations.len()
            )));
        }

        // Pair each citation index with its source, padding missing trailing
        // sources with `None`.
        let source_map: HashMap<usize, Option<&str>> = citations
            .iter()
            .enumerate()
            .zip(
                sources
                    .iter()
                    .map(|&s| Some(s))
                    .chain(std::iter::repeat(None)),
            )
            .map(|((idx, _citation), source)| (idx, source))
            .collect();

        // Map each citation's address back to its index in the input slice.
        let global_ptr_to_index: HashMap<*const Citation, usize> = citations
            .iter()
            .enumerate()
            .map(|(i, citation)| (citation as *const Citation, i))
            .collect();

        if self.config.group_by_year {
            let year_groups = Self::group_by_year_with_indices(citations);
            if self.config.run_in_parallel {
                use rayon::prelude::*;

                let duplicate_groups: Result<Vec<_>, _> = year_groups
                    .par_iter()
                    .map(|(_, citations_with_indices)| {
                        let citations_in_year: Vec<&Citation> = citations_with_indices
                            .iter()
                            .map(|(citation, _)| *citation)
                            .collect();
                        let local_to_global: HashMap<*const Citation, usize> =
                            citations_with_indices
                                .iter()
                                .map(|(citation, global_idx)| {
                                    (*citation as *const Citation, *global_idx)
                                })
                                .collect();
                        self.process_citation_group_with_sources(
                            &citations_in_year,
                            &source_map,
                            &local_to_global,
                        )
                    })
                    .collect();

                Ok(duplicate_groups?.into_iter().flatten().collect())
            } else {
                let mut duplicate_groups = Vec::new();

                for citations_with_indices in year_groups.values() {
                    let citations_in_year: Vec<&Citation> = citations_with_indices
                        .iter()
                        .map(|(citation, _)| *citation)
                        .collect();
                    let local_to_global: HashMap<*const Citation, usize> = citations_with_indices
                        .iter()
                        .map(|(citation, global_idx)| (*citation as *const Citation, *global_idx))
                        .collect();
                    duplicate_groups.extend(self.process_citation_group_with_sources(
                        &citations_in_year,
                        &source_map,
                        &local_to_global,
                    )?);
                }
                Ok(duplicate_groups)
            }
        } else {
            let citations_refs: Vec<&Citation> = citations.iter().collect();
            self.process_citation_group_with_sources(
                &citations_refs,
                &source_map,
                &global_ptr_to_index,
            )
        }
    }

    fn get_citation_year(citation: &Citation) -> Option<i32> {
        Self::get_citation_year_static(citation)
    }

    fn select_unique_citation<'a>(&self, citations: &[&'a Citation]) -> &'a Citation {
        if citations.len() == 1 {
            return citations[0];
        }

        let citations_with_abstract: Vec<_> = citations
            .iter()
            .filter(|c| c.abstract_text.is_some())
            .collect();

        match citations_with_abstract.len() {
            0 => citations[0],
            1 => citations_with_abstract[0],
            _ => {
                let with_doi = citations_with_abstract
                    .iter()
                    .find(|c| c.doi.as_ref().is_some_and(|d| !d.is_empty()));

                with_doi.copied().unwrap_or(citations_with_abstract[0])
            }
        }
    }

    fn select_unique_citation_with_sources<'a>(
        &self,
        citations: &[&'a Citation],
        citation_indices: &[usize],
        source_map: &HashMap<usize, Option<&str>>,
    ) -> &'a Citation {
        if citations.len() == 1 {
            return citations[0];
        }

        if !self.config.source_preferences.is_empty() {
            for preferred_source in &self.config.source_preferences {
                for (citation, &idx) in citations.iter().zip(citation_indices.iter()) {
                    if source_map.get(&idx) == Some(&Some(preferred_source.as_str())) {
                        return citation;
                    }
                }
            }
        }

        self.select_unique_citation(citations)
    }

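    /// Compares every pair of citations in the group and builds duplicate
    /// groups around a selected unique record.
    ///
    /// When both citations carry a non-empty DOI, DOI equality plus a similar
    /// normalized title (Jaro >= `DOI_TITLE_SIMILARITY_THRESHOLD`) and
    /// corroborating journal/ISSN or volume/page data marks a duplicate;
    /// near-identical titles with matching year and metadata are also
    /// accepted. Without a usable DOI, a stricter fuzzy title match
    /// (Jaro-Winkler >= `NO_DOI_TITLE_SIMILARITY_THRESHOLD`) plus matching
    /// journal/ISSN and volume or page data is required.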
    fn process_citation_group_with_sources(
        &self,
        citations: &[&Citation],
        source_map: &HashMap<usize, Option<&str>>,
        global_ptr_to_index: &HashMap<*const Citation, usize>,
    ) -> Result<Vec<DuplicateGroup>, DedupeError> {
        let mut duplicate_groups = Vec::new();

        let preprocessed: Vec<PreprocessedCitation> = citations
            .iter()
            .map(|c| {
                Ok(PreprocessedCitation {
                    original: c,
                    normalized_title: Self::normalize_string(&Self::convert_unicode_string(
                        &c.title,
                    ))
                    .ok_or_else(|| {
                        DedupeError::ProcessingError("Failed to normalize title".to_string())
                    })?,
                    normalized_journal: Self::format_journal_name(c.journal.as_deref()),
                    normalized_journal_abbr: Self::format_journal_name(c.journal_abbr.as_deref()),
                    normalized_volume: c
                        .volume
                        .as_deref()
                        .map_or(String::new(), Deduplicator::normalize_volume),
                    normalized_issn: c
                        .issn
                        .iter()
                        .filter_map(|issn| Deduplicator::format_issn(issn))
                        .collect(),
                })
            })
            .collect::<Result<Vec<_>, _>>()?;

        let mut processed_indices = std::collections::HashSet::new();

        for i in 0..preprocessed.len() {
            if processed_indices.contains(&i) {
                continue;
            }

            let mut group_citations = vec![preprocessed[i].original];
            let mut group_indices = vec![i];
            let current = &preprocessed[i];

            for (j, other) in preprocessed.iter().enumerate() {
                if i == j || processed_indices.contains(&j) {
                    continue;
                }

                let journal_match = Self::journals_match(
                    &current.normalized_journal,
                    &current.normalized_journal_abbr,
                    &other.normalized_journal,
                    &other.normalized_journal_abbr,
                );
                let issns_match =
                    Self::match_issns(&current.normalized_issn, &other.normalized_issn);
                let volumes_match = !current.normalized_volume.is_empty()
                    && !other.normalized_volume.is_empty()
                    && current.normalized_volume == other.normalized_volume;
                let pages_match = current.original.pages.is_some()
                    && other.original.pages.is_some()
                    && current.original.pages == other.original.pages;
                let years_match = Self::get_citation_year(current.original)
                    == Self::get_citation_year(other.original);

                let is_duplicate = match (&current.original.doi, &other.original.doi) {
                    // Both citations carry a non-empty DOI.
                    (Some(doi1), Some(doi2)) if !doi1.is_empty() && !doi2.is_empty() => {
                        let title_similarity =
                            jaro(&current.normalized_title, &other.normalized_title);

                        (doi1 == doi2
                            && title_similarity >= DOI_TITLE_SIMILARITY_THRESHOLD
                            && (journal_match || issns_match))
                            || (doi1 == doi2
                                && title_similarity >= 0.99
                                && (volumes_match || pages_match))
                            || (title_similarity >= 0.99
                                && years_match
                                && (volumes_match || pages_match)
                                && (journal_match || issns_match))
                    }
                    // At least one DOI is missing or empty: rely on a stricter
                    // fuzzy title match plus corroborating metadata.
                    _ => {
                        let title_similarity =
                            jaro_winkler(&current.normalized_title, &other.normalized_title);

                        (title_similarity >= NO_DOI_TITLE_SIMILARITY_THRESHOLD
                            && (volumes_match || pages_match)
                            && (journal_match || issns_match))
                            || (title_similarity >= 0.99
                                && years_match
                                && (volumes_match && pages_match))
                    }
                };

                if is_duplicate {
                    group_citations.push(other.original);
                    group_indices.push(j);
                    processed_indices.insert(j);
                }
            }

            if group_citations.len() > 1 {
                let original_indices: Vec<usize> = group_indices
                    .iter()
                    .map(|&local_idx| {
                        let citation_ptr = preprocessed[local_idx].original as *const Citation;
                        global_ptr_to_index[&citation_ptr]
                    })
                    .collect();

                let unique = self.select_unique_citation_with_sources(
                    &group_citations,
                    &original_indices,
                    source_map,
                );

                let duplicates: Vec<Citation> = group_citations
                    .into_iter()
                    .filter(|c| !std::ptr::eq(*c, unique))
                    .map(|c| (*c).clone())
                    .collect();

                duplicate_groups.push(DuplicateGroup {
                    unique: unique.clone(),
                    duplicates,
                });
                processed_indices.insert(i);
            } else {
                duplicate_groups.push(DuplicateGroup {
                    unique: current.original.clone(),
                    duplicates: Vec::new(),
                });
            }
        }

        Ok(duplicate_groups)
    }

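    /// Groups citations by publication year (year `0` for citations without a
    /// date), keeping each citation's index in the original slice.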
    fn group_by_year_with_indices(citations: &[Citation]) -> HashMap<i32, Vec<(&Citation, usize)>> {
        let mut year_map: HashMap<i32, Vec<(&Citation, usize)>> = HashMap::new();

        for (index, citation) in citations.iter().enumerate() {
            let year = Self::get_citation_year_static(citation).unwrap_or(0);
            year_map.entry(year).or_default().push((citation, index));
        }

        year_map
    }

    fn get_citation_year_static(citation: &Citation) -> Option<i32> {
        citation.date.as_ref().map(|d| d.year)
    }

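    /// Decodes `<U+XXXX>` escape sequences into their characters, e.g.
    /// `<U+03A9>` becomes `Ω`; sequences that are not valid code points are
    /// left unchanged.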
    fn convert_unicode_string(input: &str) -> String {
        UNICODE_REGEX
            .replace_all(input, |caps: &crate::regex::Captures| {
                u32::from_str_radix(&caps[1], 16)
                    .ok()
                    .and_then(char::from_u32)
                    .map(|c| c.to_string())
                    .unwrap_or_else(|| caps[0].to_string())
            })
            .to_string()
    }

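    /// Lowercases the input, applies `HTML_REPLACEMENTS`, and keeps only
    /// alphanumeric characters; returns `None` for an empty input.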
    fn normalize_string(string: &str) -> Option<String> {
        if string.is_empty() {
            return None;
        }

        let mut result = String::with_capacity(string.len());
        let mut s = string.trim().to_lowercase();

        for replacement in HTML_REPLACEMENTS.iter() {
            s = s.replace(replacement.0, replacement.1);
        }

        s.chars()
            .filter(|c| c.is_alphanumeric())
            .for_each(|c| result.push(c));

        Some(result)
    }

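    /// Extracts the first run of digits from a volume string, e.g.
    /// `"61 (Supplement 1)"` becomes `"61"`.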
    fn normalize_volume(volume: &str) -> String {
        if volume.is_empty() {
            return String::new();
        }

        let numbers: String = volume
            .chars()
            .skip_while(|c| !c.is_numeric())
            .take_while(|c| c.is_numeric())
            .collect();

        if numbers.is_empty() {
            String::new()
        } else {
            numbers
        }
    }

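    /// Returns `true` if any combination of the full or abbreviated journal
    /// names of the two citations matches exactly.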
    fn journals_match(
        journal1: &Option<String>,
        journal_abbr1: &Option<String>,
        journal2: &Option<String>,
        journal_abbr2: &Option<String>,
    ) -> bool {
        journal1
            .as_ref()
            .zip(journal2.as_ref())
            .is_some_and(|(j1, j2)| j1 == j2)
            || journal_abbr1
                .as_ref()
                .zip(journal_abbr2.as_ref())
                .is_some_and(|(a1, a2)| a1 == a2)
            || journal1
                .as_ref()
                .zip(journal_abbr2.as_ref())
                .is_some_and(|(j1, a2)| j1 == a2)
            || journal_abbr1
                .as_ref()
                .zip(journal2.as_ref())
                .is_some_and(|(a1, j2)| a1 == j2)
    }

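    /// Normalizes a journal name by dropping everything from the first
    /// `". Conference"` marker onward, lowercasing, and keeping only
    /// alphanumeric characters.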
    fn format_journal_name(full_name: Option<&str>) -> Option<String> {
        full_name.map(|name| {
            name.split(". Conference")
                .next()
                .unwrap_or(name)
                .trim()
                .to_lowercase()
                .chars()
                .filter(|c| c.is_alphanumeric())
                .collect::<String>()
        })
    }

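    /// Normalizes an ISSN to the canonical `NNNN-NNNN` form (a trailing `X`
    /// check digit is allowed), stripping `(Print)`, `(Electronic)`, and
    /// `(Linking)` qualifiers; returns `None` for anything that is not an
    /// eight-character ISSN.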
    fn format_issn(issn_str: &str) -> Option<String> {
        let clean_issn = issn_str
            .trim()
            .replace("(Electronic)", "")
            .replace("(Linking)", "")
            .replace("(Print)", "")
            .replace(|c: char| !c.is_ascii_digit() && c != '-' && c != 'X', "")
            .trim()
            .to_string();

        let digits: String = clean_issn
            .chars()
            .filter(|c| c.is_ascii_digit() || *c == 'X')
            .collect();

        match (clean_issn.len(), digits.len()) {
            (9, 8) if clean_issn.chars().nth(4) == Some('-') => Some(clean_issn),
            (8, 8) => Some(format!("{}-{}", &digits[..4], &digits[4..])),
            _ => None,
        }
    }

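    /// Returns `true` if the two ISSN lists share at least one entry.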
    fn match_issns(list1: &[String], list2: &[String]) -> bool {
        list1
            .iter()
            .any(|issn1| list2.iter().any(|issn2| issn1 == issn2))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_group_by_year() {
        let citations = vec![
            Citation {
                title: "Title 1".to_string(),
                authors: vec![],
                journal: None,
                journal_abbr: None,
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                volume: None,
                abstract_text: None,
                doi: None,
                ..Default::default()
            },
            Citation {
                title: "Title 2".to_string(),
                authors: vec![],
                journal: None,
                journal_abbr: None,
                date: None,
                volume: None,
                abstract_text: None,
                doi: None,
                ..Default::default()
            },
        ];

        let grouped = Deduplicator::group_by_year_with_indices(&citations);
        assert_eq!(grouped.get(&2020).unwrap().len(), 1);
        assert_eq!(grouped.get(&0).unwrap().len(), 1);
    }

    #[test]
    fn test_find_duplicates() {
        let citations = vec![
            Citation {
                title: "Title 1".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                ..Default::default()
            },
            Citation {
                title: "Title 1".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                ..Default::default()
            },
            Citation {
                title: "Title 2".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("10.1234/def".to_string()),
                journal: Some("Journal 2".to_string()),
                ..Default::default()
            },
        ];

        let deduplicator = Deduplicator::new();
        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();

        assert_eq!(duplicate_groups.len(), 2);
        assert_eq!(
            duplicate_groups
                .iter()
                .find(|g| g.unique.doi == Some("10.1234/abc".to_string()))
                .unwrap()
                .duplicates
                .len(),
            1
        );
    }

    #[test]
    fn test_missing_doi() {
        let citations = vec![
            Citation {
                title: "Title 1".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                volume: Some("24".to_string()),
                ..Default::default()
            },
            Citation {
                title: "Title 1".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("".to_string()),
                journal: Some("Journal 1".to_string()),
                volume: Some("24".to_string()),
                ..Default::default()
            },
            Citation {
                title: "Title 2".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("".to_string()),
                journal: Some("Journal 2".to_string()),
                ..Default::default()
            },
        ];

        let deduplicator = Deduplicator::new();
        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();

        assert_eq!(duplicate_groups.len(), 2);
    }

    #[test]
    fn test_normalize_string() {
        assert_eq!(
            Deduplicator::normalize_string("Machine Learning! (2<sup>nd</sup> Edition)"),
            Some("machinelearning2ndedition".to_string())
        );
        assert_eq!(
            Deduplicator::normalize_string("[<sup>11</sup>C] benzo"),
            Some("11cbenzo".to_string())
        );
    }

    #[test]
    fn test_convert_unicode_string() {
        assert_eq!(
            Deduplicator::convert_unicode_string("2<U+0391>-amino-4<U+0391>"),
            "2Α-amino-4Α",
            "Failed to convert basic Alpha Unicode sequences"
        );

        assert_eq!(
            Deduplicator::convert_unicode_string("Hello <U+03A9>orld <U+03A3>cience"),
            "Hello Ωorld Σcience",
            "Failed to convert multiple Unicode sequences"
        );

        assert_eq!(
            Deduplicator::convert_unicode_string("Normal String"),
            "Normal String",
            "Incorrectly modified string with no Unicode sequences"
        );

        assert_eq!(
            Deduplicator::convert_unicode_string(""),
            "",
            "Failed to handle empty string"
        );

        assert_eq!(
            Deduplicator::convert_unicode_string("Mixed <U+0394> Unicode <U+03A9> Test"),
            "Mixed Δ Unicode Ω Test",
            "Failed to handle mixed content with Unicode sequences"
        );

        assert_eq!(
            Deduplicator::convert_unicode_string("<U+0391><U+0392><U+0393>"),
            "ΑΒΓ",
            "Failed to convert consecutive Unicode sequences"
        );
    }

    #[test]
    fn test_normalize_volume() {
        assert_eq!(Deduplicator::normalize_volume("61"), "61");
        assert_eq!(Deduplicator::normalize_volume("61 (Supplement 1)"), "61");
        assert_eq!(Deduplicator::normalize_volume("9 (8) (no pagination)"), "9");
        assert_eq!(Deduplicator::normalize_volume("3)"), "3");
        assert_eq!(Deduplicator::normalize_volume("Part A. 242"), "242");
        assert_eq!(Deduplicator::normalize_volume("55 (10 SUPPL 1)"), "55");
        assert_eq!(Deduplicator::normalize_volume("161A"), "161");
        assert_eq!(Deduplicator::normalize_volume("74 Suppl 1"), "74");
        assert_eq!(Deduplicator::normalize_volume("20 (2)"), "20");
        assert_eq!(
            Deduplicator::normalize_volume("9 (FEB) (no pagination)"),
            "9"
        );
    }

    #[test]
    fn test_format_journal_name() {
        assert_eq!(
            Deduplicator::format_journal_name(Some(
                "Heart. Conference: British Atherosclerosis Society BAS/British Society for Cardiovascular Research BSCR Annual Meeting"
            )),
            Some("heart".to_string())
        );
        assert_eq!(
            Deduplicator::format_journal_name(Some(
                "The FASEB Journal. Conference: Experimental Biology"
            )),
            Some("thefasebjournal".to_string())
        );
        assert_eq!(
            Deduplicator::format_journal_name(Some(
                "Arteriosclerosis Thrombosis and Vascular Biology. Conference: American Heart Association's Arteriosclerosis Thrombosis and Vascular Biology"
            )),
            Some("arteriosclerosisthrombosisandvascularbiology".to_string())
        );
        assert_eq!(Deduplicator::format_journal_name(None), None);
        assert_eq!(
            Deduplicator::format_journal_name(Some("")),
            Some("".to_string())
        );
        assert_eq!(
            Deduplicator::format_journal_name(Some("Diabetologie und Stoffwechsel. Conference")),
            Some("diabetologieundstoffwechsel".to_string())
        );
    }

    #[test]
    fn test_match_issns_scenarios() {
        let issns1 = vec!["1234-5678".to_string(), "8765-4321".to_string()];
        let issns2 = vec!["0000-0000".to_string(), "1234-5678".to_string()];
        assert!(
            Deduplicator::match_issns(&issns1, &issns2),
            "Should find a matching ISSN"
        );

        let non_match_issns2 = vec!["5555-6666".to_string(), "7777-8888".to_string()];
        assert!(
            !Deduplicator::match_issns(&issns1, &non_match_issns2),
            "Should not find a matching ISSN"
        );

        let empty_issns1: Vec<String> = vec![];
        let empty_issns2: Vec<String> = vec![];
        assert!(
            !Deduplicator::match_issns(&empty_issns1, &empty_issns2),
            "Should return false for empty lists"
        );

        let partial_issns1 = vec!["1234-5678".to_string()];
        let partial_issns2: Vec<String> = vec![];
        assert!(
            !Deduplicator::match_issns(&partial_issns1, &partial_issns2),
            "Should return false when one list is empty"
        );
    }

    #[test]
    fn test_format_issn() {
        assert_eq!(
            Deduplicator::format_issn("1234-5678"),
            Some("1234-5678".to_string())
        );
        assert_eq!(
            Deduplicator::format_issn("12345678"),
            Some("1234-5678".to_string())
        );
        assert_eq!(
            Deduplicator::format_issn("1234-567X"),
            Some("1234-567X".to_string())
        );
        assert_eq!(
            Deduplicator::format_issn("1234-567X (Electronic)"),
            Some("1234-567X".to_string())
        );
        assert_eq!(
            Deduplicator::format_issn("1234-5678 (Print)"),
            Some("1234-5678".to_string())
        );
        assert_eq!(
            Deduplicator::format_issn("1234-5678 (Linking)"),
            Some("1234-5678".to_string())
        );
        assert_eq!(Deduplicator::format_issn("invalid"), None);
        assert_eq!(Deduplicator::format_issn("1234-56789"), None);
        assert_eq!(Deduplicator::format_issn("123-45678"), None);
    }

    #[test]
    fn test_without_year_grouping() {
        let citations = vec![
            Citation {
                title: "Title 1".to_string(),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                ..Default::default()
            },
            Citation {
                title: "Title 1".to_string(),
                date: Some(crate::Date {
                    year: 2019,
                    month: None,
                    day: None,
                }),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                ..Default::default()
            },
        ];

        let config = DeduplicatorConfig {
            group_by_year: false,
            ..Default::default()
        };
        let deduplicator = Deduplicator::new().with_config(config);
        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();

        assert_eq!(duplicate_groups.len(), 1);
        assert_eq!(duplicate_groups[0].duplicates.len(), 1);

        let deduplicator = Deduplicator::new();
        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();

        assert_eq!(duplicate_groups.len(), 2);
        assert!(duplicate_groups.iter().all(|g| g.duplicates.is_empty()));
    }

    #[test]
    fn test_source_preferences() {
        let citations = vec![
            Citation {
                title: "Title 1".to_string(),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
            Citation {
                title: "Title 1".to_string(),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
        ];

        let sources = vec!["source2", "source1"];

        let config = DeduplicatorConfig {
            source_preferences: vec!["source1".to_string(), "source2".to_string()],
            ..Default::default()
        };

        let deduplicator = Deduplicator::new().with_config(config);
        let duplicate_groups = deduplicator
            .find_duplicates_with_sources(&citations, &sources)
            .unwrap();

        assert_eq!(duplicate_groups.len(), 1);
        assert_eq!(duplicate_groups[0].duplicates.len(), 1);
    }

    #[test]
    fn test_abstract_preference() {
        let citations = vec![
            Citation {
                title: "Title 1".to_string(),
                abstract_text: None,
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
            Citation {
                title: "Title 1".to_string(),
                abstract_text: Some("Abstract".to_string()),
                doi: Some("10.1234/abc".to_string()),
                journal: Some("Journal 1".to_string()),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
        ];

        let deduplicator = Deduplicator::new();
        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();

        assert_eq!(duplicate_groups.len(), 1);
        assert!(duplicate_groups[0].unique.abstract_text.is_some());
        assert_eq!(duplicate_groups[0].duplicates.len(), 1);
    }

    #[test]
    fn test_source_preferences_with_year_grouping() {
        let citations = vec![
            Citation {
                title: "Test Article 2020".to_string(),
                doi: Some("10.1234/test2020".to_string()),
                journal: Some("Test Journal".to_string()),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
            Citation {
                title: "Test Article 2020".to_string(),
                doi: Some("10.1234/test2020".to_string()),
                journal: Some("Test Journal".to_string()),
                date: Some(crate::Date {
                    year: 2020,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
            Citation {
                title: "Test Article 2021".to_string(),
                doi: Some("10.1234/test2021".to_string()),
                journal: Some("Test Journal".to_string()),
                date: Some(crate::Date {
                    year: 2021,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
            Citation {
                title: "Test Article 2021".to_string(),
                doi: Some("10.1234/test2021".to_string()),
                journal: Some("Test Journal".to_string()),
                date: Some(crate::Date {
                    year: 2021,
                    month: None,
                    day: None,
                }),
                ..Default::default()
            },
        ];

        let sources = vec!["Embase", "PubMed", "Embase", "PubMed"];

        let config = DeduplicatorConfig {
            group_by_year: true,
            run_in_parallel: false,
            source_preferences: vec!["PubMed".to_string(), "Embase".to_string()],
        };

        let deduplicator = Deduplicator::new().with_config(config);
        let duplicate_groups = deduplicator
            .find_duplicates_with_sources(&citations, &sources)
            .unwrap();

        assert_eq!(duplicate_groups.len(), 2);

        let unique_titles: Vec<&str> = duplicate_groups
            .iter()
            .map(|group| group.unique.title.as_str())
            .collect();

        assert!(unique_titles.contains(&"Test Article 2020"));
        assert!(unique_titles.contains(&"Test Article 2021"));

        for group in &duplicate_groups {
            assert_eq!(group.duplicates.len(), 1);
        }
    }
}