use crate::regex::Regex;
use crate::{Citation, DuplicateGroup};
use std::collections::HashMap;
use std::sync::LazyLock;
use strsim::jaro;
use strsim::jaro_winkler;
/// Minimum Jaro title similarity accepted when both citations carry the same
/// non-empty DOI and also agree on journal name or ISSN.
const DOI_TITLE_SIMILARITY_THRESHOLD: f64 = 0.85;
/// Minimum Jaro-Winkler title similarity accepted when at least one citation
/// has no usable DOI; volume-or-pages and journal-or-ISSN must also agree.
const NO_DOI_TITLE_SIMILARITY_THRESHOLD: f64 = 0.93;
/// Matches escaped code points written as `<U+XXXX>`; capture group 1 holds
/// the hexadecimal digits. Compiled lazily on first use.
static UNICODE_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"<U\+([0-9A-Fa-f]+)>").unwrap());
/// Ordered `(pattern, replacement)` pairs applied to an already lower-cased
/// title during normalization.
///
/// Order matters: the `&lt;` / `&gt;` entities are decoded first so that
/// escaped markup such as `&lt;sup&gt;` becomes `<sup>` and is then removed by
/// the tag entries that follow. Greek letter names and symbols are folded to a
/// single Latin letter so that differently transliterated titles compare equal.
const HTML_REPLACEMENTS: [(&str, &str); 13] = [
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("<sup>", ""),
    ("</sup>", ""),
    ("<sub>", ""),
    ("</sub>", ""),
    ("<inf>", ""),
    ("</inf>", ""),
    ("beta", "b"),
    ("alpha", "a"),
    ("α", "a"),
    ("ß", "b"),
    ("γ", "g"),
];
/// Options controlling how [`Deduplicator`] partitions citations and picks the
/// record to keep from each duplicate group.
///
/// The derived `Default` sets both flags to `false` and leaves the preference
/// list empty, which differs from `Deduplicator::new()` (that constructor
/// enables `group_by_year`).
#[derive(Debug, Default, Clone)]
pub struct DeduplicatorConfig {
/// When `true`, citations are bucketed by publication year and only compared
/// with other citations in the same bucket.
pub group_by_year: bool,
/// When `true` (and `group_by_year` is also `true`), year buckets are
/// processed through rayon's parallel iterators. `with_config` clears this
/// flag whenever `group_by_year` is `false`.
pub run_in_parallel: bool,
/// Source names in priority order; within a duplicate group the citation
/// whose source matches the earliest entry is kept as the unique record.
pub source_preferences: Vec<String>,
}
/// Finds groups of duplicate citations according to a [`DeduplicatorConfig`].
///
/// Note that the derived `Default` wraps `DeduplicatorConfig::default()`
/// (year grouping off), whereas `Deduplicator::new()` turns year grouping on.
#[derive(Debug, Default, Clone)]
pub struct Deduplicator {
// Active settings; replaced wholesale by `with_config`.
config: DeduplicatorConfig,
}
/// A borrowed citation together with the normalized fields that are compared
/// repeatedly, so normalization happens once per citation rather than once
/// per pair.
#[derive(Debug)]
struct PreprocessedCitation<'a> {
// The untouched input record this entry was derived from.
original: &'a Citation,
// Title after `<U+XXXX>` decoding and `normalize_string`.
normalized_title: String,
// Full journal name after `format_journal_name`, if the citation has one.
normalized_journal: Option<String>,
// Abbreviated journal name after `format_journal_name`, if present.
normalized_journal_abbr: Option<String>,
// ISSNs that `format_issn` accepted, in `NNNN-NNNN` form.
normalized_issn: Vec<String>,
// First run of digits from the volume field; empty when there is none.
normalized_volume: String,
}
/// Errors reported by the deduplication API.
#[derive(Debug, thiserror::Error)]
pub enum DedupeError {
/// A citation carried data that could not be used; the payload describes it.
#[error("Invalid citation data: {0}")]
InvalidCitation(String),
/// A processing step failed, e.g. a title could not be normalized.
#[error("Processing error: {0}")]
ProcessingError(String),
/// The caller supplied inconsistent arguments, e.g. more sources than
/// citations.
#[error("Configuration error: {0}")]
ConfigError(String),
}
impl Deduplicator {
/// Creates a deduplicator with year grouping enabled, parallel processing
/// disabled and no source preferences.
#[must_use]
pub fn new() -> Self {
    let config = DeduplicatorConfig {
        source_preferences: Vec::new(),
        run_in_parallel: false,
        group_by_year: true,
    };
    Self { config }
}
/// Replaces the configuration and returns the updated deduplicator.
///
/// Parallel processing only applies to year buckets, so `run_in_parallel`
/// is forced to `false` whenever `group_by_year` is `false`.
#[must_use]
pub fn with_config(mut self, mut config: DeduplicatorConfig) -> Self {
    config.run_in_parallel = config.run_in_parallel && config.group_by_year;
    self.config = config;
    self
}
/// Finds duplicate groups among `citations` without any source information.
///
/// Equivalent to calling `find_duplicates_with_sources` with an empty
/// source list. Consumes the deduplicator.
///
/// # Errors
///
/// Propagates any [`DedupeError`] produced by
/// `find_duplicates_with_sources`.
pub fn find_duplicates(
self,
citations: &[Citation],
) -> Result<Vec<DuplicateGroup>, DedupeError> {
self.find_duplicates_with_sources(citations, &[])
}
/// Finds duplicate groups among `citations`, using `sources` to decide which
/// record of each group is kept.
///
/// `sources[i]` names the source of `citations[i]`; citations beyond
/// `sources.len()` are treated as having no source. Every citation ends up in
/// exactly one returned group, either as the `unique` record or as one of its
/// `duplicates` (a citation without duplicates forms a group on its own).
///
/// When year grouping is enabled, buckets are processed in ascending year
/// order (citations without a date use year `0`), so the order of the
/// returned groups is the same on every run.
///
/// # Errors
///
/// * [`DedupeError::ConfigError`] if `sources` is longer than `citations`.
/// * [`DedupeError::ProcessingError`] if a citation cannot be preprocessed
///   (for example, its title is empty).
pub fn find_duplicates_with_sources(
    self,
    citations: &[Citation],
    sources: &[&str],
) -> Result<Vec<DuplicateGroup>, DedupeError> {
    if citations.is_empty() {
        return Ok(Vec::new());
    }
    if sources.len() > citations.len() {
        return Err(DedupeError::ConfigError(format!(
            "Number of sources ({}) exceeds number of citations ({}). Each source must correspond to a citation.",
            sources.len(),
            citations.len()
        )));
    }

    // Global citation index -> source name (`None` once `sources` runs out).
    let source_map: HashMap<usize, Option<&str>> = (0..citations.len())
        .map(|idx| (idx, sources.get(idx).copied()))
        .collect();

    if !self.config.group_by_year {
        // Single pass over everything: the pointer index is only needed here.
        let citations_refs: Vec<&Citation> = citations.iter().collect();
        let global_ptr_to_index: HashMap<*const Citation, usize> = citations
            .iter()
            .enumerate()
            .map(|(i, citation)| (citation as *const Citation, i))
            .collect();
        return self.process_citation_group_with_sources(
            &citations_refs,
            &source_map,
            &global_ptr_to_index,
        );
    }

    // HashMap iteration order is unspecified; sort the buckets by year so the
    // output order does not change from run to run.
    let mut year_groups: Vec<(i32, Vec<(&Citation, usize)>)> =
        Self::group_by_year_with_indices(citations)
            .into_iter()
            .collect();
    year_groups.sort_unstable_by_key(|(year, _)| *year);

    if self.config.run_in_parallel {
        use rayon::prelude::*;
        let per_year: Result<Vec<_>, DedupeError> = year_groups
            .par_iter()
            .map(|(_, group)| self.process_year_group(group, &source_map))
            .collect();
        Ok(per_year?.into_iter().flatten().collect())
    } else {
        let mut duplicate_groups = Vec::new();
        for (_, group) in &year_groups {
            duplicate_groups.extend(self.process_year_group(group, &source_map)?);
        }
        Ok(duplicate_groups)
    }
}

/// Deduplicates one year bucket, translating bucket-local citations back to
/// their positions in the caller's slice for source lookup.
fn process_year_group(
    &self,
    citations_with_indices: &[(&Citation, usize)],
    source_map: &HashMap<usize, Option<&str>>,
) -> Result<Vec<DuplicateGroup>, DedupeError> {
    let citations_in_year: Vec<&Citation> = citations_with_indices
        .iter()
        .map(|&(citation, _)| citation)
        .collect();
    let local_to_global: HashMap<*const Citation, usize> = citations_with_indices
        .iter()
        .map(|&(citation, global_idx)| (citation as *const Citation, global_idx))
        .collect();
    self.process_citation_group_with_sources(&citations_in_year, source_map, &local_to_global)
}
/// Returns the publication year of `citation`, or `None` when it has no date.
///
/// Thin alias for `get_citation_year_static`.
fn get_citation_year(citation: &Citation) -> Option<i32> {
Self::get_citation_year_static(citation)
}
/// Picks the record to keep from a group of duplicates when no source
/// preference decides it.
///
/// Selection order:
/// 1. a lone citation is returned as-is;
/// 2. if no citation has an abstract, the first citation wins;
/// 3. otherwise the first citation with an abstract *and* a non-empty DOI
///    wins, falling back to the first citation with an abstract.
///
/// # Panics
///
/// Panics if `citations` is empty.
fn select_unique_citation<'a>(&self, citations: &[&'a Citation]) -> &'a Citation {
    if let [only] = citations {
        return *only;
    }
    let mut with_abstract = citations
        .iter()
        .copied()
        .filter(|c| c.abstract_text.is_some());
    let Some(first) = with_abstract.next() else {
        return citations[0];
    };
    // Re-attach `first` so the DOI search covers every citation with an abstract.
    std::iter::once(first)
        .chain(with_abstract)
        .find(|c| c.doi.as_ref().is_some_and(|d| !d.is_empty()))
        .unwrap_or(first)
}
/// Picks the record to keep from a group of duplicates, honouring the
/// configured source preferences.
///
/// `citation_indices[k]` is the key of `citations[k]` in `source_map`.
/// Preferences are tried in order; the first citation whose source equals the
/// current preference is returned. If no preference matches (or none are
/// configured) the choice falls back to `select_unique_citation`.
fn select_unique_citation_with_sources<'a>(
    &self,
    citations: &[&'a Citation],
    citation_indices: &[usize],
    source_map: &HashMap<usize, Option<&str>>,
) -> &'a Citation {
    if let [only] = citations {
        return *only;
    }
    let preferred = self.config.source_preferences.iter().find_map(|wanted| {
        citations
            .iter()
            .zip(citation_indices)
            .find(|&(_, idx)| source_map.get(idx).copied().flatten() == Some(wanted.as_str()))
            .map(|(citation, _)| *citation)
    });
    match preferred {
        Some(citation) => citation,
        None => self.select_unique_citation(citations),
    }
}
fn process_citation_group_with_sources(
&self,
citations: &[&Citation],
source_map: &HashMap<usize, Option<&str>>,
global_ptr_to_index: &HashMap<*const Citation, usize>,
) -> Result<Vec<DuplicateGroup>, DedupeError> {
let mut duplicate_groups = Vec::new();
let preprocessed: Vec<PreprocessedCitation> = citations
.iter()
.map(|c| {
Ok(PreprocessedCitation {
original: c,
normalized_title: Self::normalize_string(&Self::convert_unicode_string(
&c.title,
))
.ok_or_else(|| {
DedupeError::ProcessingError("Failed to normalize title".to_string())
})?,
normalized_journal: Self::format_journal_name(c.journal.as_deref()),
normalized_journal_abbr: Self::format_journal_name(c.journal_abbr.as_deref()),
normalized_volume: c
.volume
.as_deref()
.map_or(String::new(), Deduplicator::normalize_volume),
normalized_issn: c
.issn
.iter()
.filter_map(|issn| Deduplicator::format_issn(issn))
.collect(),
})
})
.collect::<Result<Vec<_>, _>>()?;
let mut processed_indices = std::collections::HashSet::new();
for i in 0..preprocessed.len() {
if processed_indices.contains(&i) {
continue;
}
let mut group_citations = vec![preprocessed[i].original];
let mut group_indices = vec![i];
let current = &preprocessed[i];
for (j, other) in preprocessed.iter().enumerate() {
if i == j || processed_indices.contains(&j) {
continue;
}
let journal_match = Self::journals_match(
¤t.normalized_journal,
¤t.normalized_journal_abbr,
&other.normalized_journal,
&other.normalized_journal_abbr,
);
let issns_match =
Self::match_issns(¤t.normalized_issn, &other.normalized_issn);
let volumes_match = !current.normalized_volume.is_empty()
&& !other.normalized_volume.is_empty()
&& current.normalized_volume == other.normalized_volume;
let pages_match = current.original.pages.is_some()
&& other.original.pages.is_some()
&& current.original.pages == other.original.pages;
let years_match = Self::get_citation_year(current.original)
== Self::get_citation_year(other.original);
let is_duplicate = match (¤t.original.doi, &other.original.doi) {
(Some(doi1), Some(doi2)) if !doi1.is_empty() && !doi2.is_empty() => {
let title_similarity =
jaro(¤t.normalized_title, &other.normalized_title);
(doi1 == doi2 && title_similarity >= DOI_TITLE_SIMILARITY_THRESHOLD && (journal_match || issns_match))
|| (doi1 == doi2 && title_similarity >= 0.99 && (volumes_match || pages_match))
|| (title_similarity >= 0.99 && years_match && (volumes_match || pages_match) && (journal_match || issns_match))
}
_ => {
let title_similarity =
jaro_winkler(¤t.normalized_title, &other.normalized_title);
(title_similarity >= NO_DOI_TITLE_SIMILARITY_THRESHOLD && (volumes_match || pages_match) && (journal_match || issns_match))
|| (title_similarity >= 0.99 && years_match && (volumes_match && pages_match))
}
};
if is_duplicate {
group_citations.push(other.original);
group_indices.push(j);
processed_indices.insert(j);
}
}
if group_citations.len() > 1 {
let original_indices: Vec<usize> = group_indices
.iter()
.map(|&local_idx| {
let citation_ptr = preprocessed[local_idx].original as *const Citation;
global_ptr_to_index[&citation_ptr]
})
.collect();
let unique = self.select_unique_citation_with_sources(
&group_citations,
&original_indices,
source_map,
);
let duplicates: Vec<Citation> = group_citations
.into_iter()
.filter(|c| !std::ptr::eq(*c, unique))
.map(|c| (*c).clone())
.collect();
duplicate_groups.push(DuplicateGroup {
unique: unique.clone(),
duplicates,
});
processed_indices.insert(i);
} else {
duplicate_groups.push(DuplicateGroup {
unique: current.original.clone(),
duplicates: Vec::new(),
});
}
}
Ok(duplicate_groups)
}
/// Buckets citations by publication year, pairing each citation with its
/// position in the input slice.
///
/// Citations without a date are placed in the bucket for year `0`.
fn group_by_year_with_indices(citations: &[Citation]) -> HashMap<i32, Vec<(&Citation, usize)>> {
    citations.iter().enumerate().fold(
        HashMap::<i32, Vec<(&Citation, usize)>>::new(),
        |mut buckets, (position, entry)| {
            let bucket_year = Self::get_citation_year_static(entry).unwrap_or(0);
            buckets
                .entry(bucket_year)
                .or_default()
                .push((entry, position));
            buckets
        },
    )
}
/// Returns the year stored in the citation's date, or `None` when the
/// citation has no date.
fn get_citation_year_static(citation: &Citation) -> Option<i32> {
    let date = citation.date.as_ref()?;
    Some(date.year)
}
/// Replaces every `<U+XXXX>` escape in `input` with the character it names.
///
/// Escapes whose hex value does not parse as a `u32` or is not a valid
/// Unicode scalar value are left in the output unchanged.
fn convert_unicode_string(input: &str) -> String {
    UNICODE_REGEX
        .replace_all(input, |caps: &crate::regex::Captures| {
            let decoded = u32::from_str_radix(&caps[1], 16)
                .ok()
                .and_then(char::from_u32);
            match decoded {
                Some(ch) => ch.to_string(),
                // Keep the original escape text when it cannot be decoded.
                None => caps[0].to_string(),
            }
        })
        .to_string()
}
/// Reduces a title to a comparison key: trimmed, lower-cased, with every
/// `HTML_REPLACEMENTS` pair applied in order and all non-alphanumeric
/// characters removed.
///
/// Returns `None` only for an empty input; a non-empty input that normalizes
/// to nothing (e.g. only whitespace) yields `Some("")`.
fn normalize_string(string: &str) -> Option<String> {
    if string.is_empty() {
        return None;
    }
    let substituted = HTML_REPLACEMENTS
        .iter()
        .fold(string.trim().to_lowercase(), |text, &(from, to)| {
            text.replace(from, to)
        });
    let mut key = String::with_capacity(string.len());
    key.extend(substituted.chars().filter(|ch| ch.is_alphanumeric()));
    Some(key)
}
/// Extracts the first contiguous run of numeric characters from a volume
/// field, e.g. `"Part A. 242"` becomes `"242"` and `"61 (Supplement 1)"`
/// becomes `"61"`.
///
/// Returns an empty string when the input contains no numeric character.
fn normalize_volume(volume: &str) -> String {
    let Some(start) = volume.find(char::is_numeric) else {
        return String::new();
    };
    let tail = &volume[start..];
    let end = tail
        .find(|ch: char| !ch.is_numeric())
        .unwrap_or(tail.len());
    tail[..end].to_string()
}
/// Reports whether two citations name the same journal.
///
/// Each citation contributes a full name and an abbreviation (either may be
/// absent). The journals match when any present name of the first citation
/// equals any present name of the second: full/full, abbreviation/
/// abbreviation, or either cross combination. Absent names never match.
fn journals_match(
    journal1: &Option<String>,
    journal_abbr1: &Option<String>,
    journal2: &Option<String>,
    journal_abbr2: &Option<String>,
) -> bool {
    let candidate_pairs = [
        (journal1, journal2),
        (journal_abbr1, journal_abbr2),
        (journal1, journal_abbr2),
        (journal_abbr1, journal2),
    ];
    // `None == None` must not count, hence the explicit `is_some` guard.
    candidate_pairs
        .iter()
        .any(|&(left, right)| left.is_some() && left == right)
}
/// Normalizes a journal name for comparison.
///
/// Everything from the first `". Conference"` onwards is dropped, then the
/// remainder is trimmed, lower-cased and stripped of non-alphanumeric
/// characters. `None` stays `None`; an empty name yields `Some("")`.
fn format_journal_name(full_name: Option<&str>) -> Option<String> {
    let name = full_name?;
    let journal_part = match name.split_once(". Conference") {
        Some((before, _)) => before,
        None => name,
    };
    let mut normalized = journal_part.trim().to_lowercase();
    normalized.retain(|ch| ch.is_alphanumeric());
    Some(normalized)
}
/// Normalizes an ISSN to the canonical `NNNN-NNNC` form.
///
/// All characters other than ASCII digits, `-` and upper-case `X` are
/// discarded (this also removes annotations such as `(Electronic)`,
/// `(Linking)` and `(Print)`). The result is accepted when it holds exactly
/// eight digit/`X` characters and is either already hyphenated after the
/// fourth character or not hyphenated at all; anything else yields `None`.
fn format_issn(issn_str: &str) -> Option<String> {
    let kept: String = issn_str
        .chars()
        .filter(|&ch| ch.is_ascii_digit() || ch == '-' || ch == 'X')
        .collect();
    let significant = kept.chars().filter(|&ch| ch != '-').count();
    if significant != 8 {
        return None;
    }
    // `kept` is pure ASCII here, so byte offsets equal character offsets.
    match kept.len() {
        9 if kept.as_bytes()[4] == b'-' => Some(kept),
        8 => Some(format!("{}-{}", &kept[..4], &kept[4..])),
        _ => None,
    }
}
/// Reports whether the two ISSN lists share at least one identical entry.
///
/// Returns `false` when either list is empty.
fn match_issns(list1: &[String], list2: &[String]) -> bool {
    for issn in list1 {
        if list2.contains(issn) {
            return true;
        }
    }
    false
}
}
#[cfg(test)]
mod tests {
use super::*;
// Citations are bucketed by their date's year; a citation without a date
// lands in the bucket for year 0.
#[test]
fn test_group_by_year() {
let citations = vec![
Citation {
title: "Title 1".to_string(),
authors: vec![],
journal: None,
journal_abbr: None,
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
volume: None,
abstract_text: None,
doi: None,
..Default::default()
},
Citation {
title: "Title 2".to_string(),
authors: vec![],
journal: None,
journal_abbr: None,
date: None,
volume: None,
abstract_text: None,
doi: None,
..Default::default()
},
];
let grouped = Deduplicator::group_by_year_with_indices(&citations);
assert_eq!(grouped.get(&2020).unwrap().len(), 1);
assert_eq!(grouped.get(&0).unwrap().len(), 1);
}
// Two citations sharing DOI, title and journal collapse into one group with
// one duplicate; the third, unrelated citation forms its own group.
#[test]
fn test_find_duplicates() {
let citations = vec![
Citation {
title: "Title 1".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
..Default::default()
},
Citation {
title: "Title 1".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
..Default::default()
},
Citation {
title: "Title 2".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("10.1234/def".to_string()),
journal: Some("Journal 2".to_string()),
..Default::default()
},
];
let deduplicator = Deduplicator::new();
let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
assert_eq!(duplicate_groups.len(), 2);
assert_eq!(
duplicate_groups
.iter()
.find(|g| g.unique.doi == Some("10.1234/abc".to_string()))
.unwrap()
.duplicates
.len(),
1
);
}
// An empty DOI string is treated like a missing DOI: the first two citations
// still merge through the title + volume + journal rule, leaving two groups.
#[test]
fn test_missing_doi() {
let citations = vec![
Citation {
title: "Title 1".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
volume: Some("24".to_string()),
..Default::default()
},
Citation {
title: "Title 1".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("".to_string()),
journal: Some("Journal 1".to_string()),
volume: Some("24".to_string()),
..Default::default()
},
Citation {
title: "Title 2".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("".to_string()),
journal: Some("Journal 2".to_string()),
..Default::default()
},
];
let deduplicator = Deduplicator::new();
let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
assert_eq!(duplicate_groups.len(), 2);
}
// Normalization lower-cases, strips markup tags and drops every
// non-alphanumeric character.
#[test]
fn test_normalize_string() {
assert_eq!(
Deduplicator::normalize_string("Machine Learning! (2<sup>nd</sup> Edition)"),
Some("machinelearning2ndedition".to_string())
);
assert_eq!(
Deduplicator::normalize_string("[<sup>11</sup>C] benzo"),
Some("11cbenzo".to_string())
);
}
// `<U+XXXX>` escapes are replaced by the character they name; text without
// escapes (including the empty string) passes through unchanged.
#[test]
fn test_convert_unicode_string() {
assert_eq!(
Deduplicator::convert_unicode_string("2<U+0391>-amino-4<U+0391>"),
"2Α-amino-4Α",
"Failed to convert basic Alpha Unicode sequences"
);
assert_eq!(
Deduplicator::convert_unicode_string("Hello <U+03A9>orld <U+03A3>cience"),
"Hello Ωorld Σcience",
"Failed to convert multiple Unicode sequences"
);
assert_eq!(
Deduplicator::convert_unicode_string("Normal String"),
"Normal String",
"Incorrectly modified string with no Unicode sequences"
);
assert_eq!(
Deduplicator::convert_unicode_string(""),
"",
"Failed to handle empty string"
);
assert_eq!(
Deduplicator::convert_unicode_string("Mixed <U+0394> Unicode <U+03A9> Test"),
"Mixed Δ Unicode Ω Test",
"Failed to handle mixed content with Unicode sequences"
);
assert_eq!(
Deduplicator::convert_unicode_string("<U+0391><U+0392><U+0393>"),
"ΑΒΓ",
"Failed to convert consecutive Unicode sequences"
);
}
// Only the first run of digits in a volume field is kept; supplements,
// issue numbers and letter suffixes are ignored.
#[test]
fn test_normalize_volume() {
assert_eq!(Deduplicator::normalize_volume("61"), "61");
assert_eq!(Deduplicator::normalize_volume("61 (Supplement 1)"), "61");
assert_eq!(Deduplicator::normalize_volume("9 (8) (no pagination)"), "9");
assert_eq!(Deduplicator::normalize_volume("3)"), "3");
assert_eq!(Deduplicator::normalize_volume("Part A. 242"), "242");
assert_eq!(Deduplicator::normalize_volume("55 (10 SUPPL 1)"), "55");
assert_eq!(Deduplicator::normalize_volume("161A"), "161");
assert_eq!(Deduplicator::normalize_volume("74 Suppl 1"), "74");
assert_eq!(Deduplicator::normalize_volume("20 (2)"), "20");
assert_eq!(
Deduplicator::normalize_volume("9 (FEB) (no pagination)"),
"9"
);
}
// Journal names lose any ". Conference…" suffix and are reduced to
// lower-case alphanumerics; `None` stays `None` and "" stays "".
#[test]
fn test_format_journal_name() {
assert_eq!(
Deduplicator::format_journal_name(Some(
"Heart. Conference: British Atherosclerosis Society BAS/British Society for Cardiovascular Research BSCR Annual Meeting"
)),
Some("heart".to_string())
);
assert_eq!(
Deduplicator::format_journal_name(Some(
"The FASEB Journal. Conference: Experimental Biology"
)),
Some("thefasebjournal".to_string())
);
assert_eq!(
Deduplicator::format_journal_name(Some(
"Arteriosclerosis Thrombosis and Vascular Biology. Conference: American Heart Association's Arteriosclerosis Thrombosis and Vascular Biology"
)),
Some("arteriosclerosisthrombosisandvascularbiology".to_string())
);
assert_eq!(Deduplicator::format_journal_name(None), None);
assert_eq!(
Deduplicator::format_journal_name(Some("")),
Some("".to_string())
);
assert_eq!(
Deduplicator::format_journal_name(Some("Diabetologie und Stoffwechsel. Conference")),
Some("diabetologieundstoffwechsel".to_string())
);
}
// ISSN lists match when they share at least one entry; disjoint or empty
// lists never match.
#[test]
fn test_match_issns_scenarios() {
let issns1 = vec!["1234-5678".to_string(), "8765-4321".to_string()];
let issns2 = vec!["0000-0000".to_string(), "1234-5678".to_string()];
assert!(
Deduplicator::match_issns(&issns1, &issns2),
"Should find a matching ISSN"
);
let non_match_issns2 = vec!["5555-6666".to_string(), "7777-8888".to_string()];
assert!(
!Deduplicator::match_issns(&issns1, &non_match_issns2),
"Should not find a matching ISSN"
);
let empty_issns1: Vec<String> = vec![];
let empty_issns2: Vec<String> = vec![];
assert!(
!Deduplicator::match_issns(&empty_issns1, &empty_issns2),
"Should return false for empty lists"
);
let partial_issns1 = vec!["1234-5678".to_string()];
let partial_issns2: Vec<String> = vec![];
assert!(
!Deduplicator::match_issns(&partial_issns1, &partial_issns2),
"Should return false when one list is empty"
);
}
// Valid ISSNs are normalized to `NNNN-NNNC` (annotations such as
// "(Print)" are dropped); malformed values are rejected.
#[test]
fn test_format_issn() {
assert_eq!(
Deduplicator::format_issn("1234-5678"),
Some("1234-5678".to_string())
);
assert_eq!(
Deduplicator::format_issn("12345678"),
Some("1234-5678".to_string())
);
assert_eq!(
Deduplicator::format_issn("1234-567X"),
Some("1234-567X".to_string())
);
assert_eq!(
Deduplicator::format_issn("1234-567X (Electronic)"),
Some("1234-567X".to_string())
);
assert_eq!(
Deduplicator::format_issn("1234-5678 (Print)"),
Some("1234-5678".to_string())
);
assert_eq!(
Deduplicator::format_issn("1234-5678 (Linking)"),
Some("1234-5678".to_string())
);
assert_eq!(Deduplicator::format_issn("invalid"), None);
assert_eq!(Deduplicator::format_issn("1234-56789"), None);
assert_eq!(Deduplicator::format_issn("123-45678"), None);
}
// Two citations that differ only in year merge when year grouping is off
// (DOI + title + journal rule), but stay separate under the default
// `Deduplicator::new()`, which compares only within a year bucket.
#[test]
fn test_without_year_grouping() {
let citations = vec![
Citation {
title: "Title 1".to_string(),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
..Default::default()
},
Citation {
title: "Title 1".to_string(),
date: Some(crate::Date {
year: 2019, month: None,
day: None,
}),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
..Default::default()
},
];
let config = DeduplicatorConfig {
group_by_year: false,
..Default::default()
};
let deduplicator = Deduplicator::new().with_config(config);
let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
assert_eq!(duplicate_groups.len(), 1);
assert_eq!(duplicate_groups[0].duplicates.len(), 1);
// `find_duplicates` consumes the deduplicator, so build a fresh default one.
let deduplicator = Deduplicator::new();
let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
assert_eq!(duplicate_groups.len(), 2);
assert!(duplicate_groups.iter().all(|g| g.duplicates.is_empty()));
}
// Two identical citations from different sources collapse into one group
// when source preferences are configured. The config uses
// `..Default::default()`, so year grouping is off here. Only the group shape
// is asserted, not which of the two records was kept.
#[test]
fn test_source_preferences() {
let citations = vec![
Citation {
title: "Title 1".to_string(),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
..Default::default()
},
Citation {
title: "Title 1".to_string(),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
..Default::default()
},
];
let sources = vec!["source2", "source1"];
let config = DeduplicatorConfig {
source_preferences: vec!["source1".to_string(), "source2".to_string()],
..Default::default()
};
let deduplicator = Deduplicator::new().with_config(config);
let duplicate_groups = deduplicator
.find_duplicates_with_sources(&citations, &sources)
.unwrap();
assert_eq!(duplicate_groups.len(), 1);
assert_eq!(duplicate_groups[0].duplicates.len(), 1);
}
// With no source preferences, the citation that has an abstract is kept as
// the unique record of the group.
#[test]
fn test_abstract_preference() {
let citations = vec![
Citation {
title: "Title 1".to_string(),
abstract_text: None,
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
..Default::default()
},
Citation {
title: "Title 1".to_string(),
abstract_text: Some("Abstract".to_string()),
doi: Some("10.1234/abc".to_string()),
journal: Some("Journal 1".to_string()),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
..Default::default()
},
];
let deduplicator = Deduplicator::new();
let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
assert_eq!(duplicate_groups.len(), 1);
assert!(duplicate_groups[0].unique.abstract_text.is_some());
assert_eq!(duplicate_groups[0].duplicates.len(), 1);
}
// Source preferences combined with year grouping: each year bucket holds one
// pair of identical citations, so two groups with one duplicate each are
// expected. The assertions do not depend on the order of the groups.
#[test]
fn test_source_preferences_with_year_grouping() {
let citations = vec![
Citation {
title: "Test Article 2020".to_string(),
doi: Some("10.1234/test2020".to_string()),
journal: Some("Test Journal".to_string()),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
..Default::default()
},
Citation {
title: "Test Article 2020".to_string(), doi: Some("10.1234/test2020".to_string()),
journal: Some("Test Journal".to_string()),
date: Some(crate::Date {
year: 2020,
month: None,
day: None,
}),
..Default::default()
},
Citation {
title: "Test Article 2021".to_string(),
doi: Some("10.1234/test2021".to_string()),
journal: Some("Test Journal".to_string()),
date: Some(crate::Date {
year: 2021,
month: None,
day: None,
}),
..Default::default()
},
Citation {
title: "Test Article 2021".to_string(), doi: Some("10.1234/test2021".to_string()),
journal: Some("Test Journal".to_string()),
date: Some(crate::Date {
year: 2021,
month: None,
day: None,
}),
..Default::default()
},
];
let sources = vec!["Embase", "PubMed", "Embase", "PubMed"];
let config = DeduplicatorConfig {
group_by_year: true, run_in_parallel: false,
source_preferences: vec!["PubMed".to_string(), "Embase".to_string()],
};
let deduplicator = Deduplicator::new().with_config(config);
let duplicate_groups = deduplicator
.find_duplicates_with_sources(&citations, &sources)
.unwrap();
assert_eq!(duplicate_groups.len(), 2);
let unique_titles: Vec<&str> = duplicate_groups
.iter()
.map(|group| group.unique.title.as_str())
.collect();
assert!(unique_titles.contains(&"Test Article 2020"));
assert!(unique_titles.contains(&"Test Article 2021"));
for group in &duplicate_groups {
assert_eq!(group.duplicates.len(), 1);
}
}
}