use std::collections::{HashMap, HashSet};
use strsim::jaro_winkler;
use crate::models::Paper;
/// Policy for which member of a duplicate group survives deduplication.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateStrategy {
    /// Keep the first paper encountered; remove later duplicates.
    First,
    /// Keep the last paper encountered; remove earlier duplicates.
    Last,
    /// Identify duplicates but remove nothing (all papers are kept).
    Mark,
}
/// Groups indices of duplicate papers via a pairwise O(n^2) scan.
///
/// Each returned group holds at least two indices (in ascending order);
/// papers with no duplicates do not appear in the output. An index joins
/// at most one group: once claimed, it is skipped by later anchors.
pub fn find_duplicates(papers: &[Paper]) -> Vec<Vec<usize>> {
    let mut duplicate_groups: Vec<Vec<usize>> = Vec::new();
    let mut claimed: HashSet<usize> = HashSet::new();
    for (anchor_idx, anchor) in papers.iter().enumerate() {
        if claimed.contains(&anchor_idx) {
            continue;
        }
        let mut members: Vec<usize> = vec![anchor_idx];
        // Only look forward: earlier indices were already anchors.
        for (candidate_idx, candidate) in papers.iter().enumerate().skip(anchor_idx + 1) {
            if claimed.contains(&candidate_idx) || !are_duplicates(anchor, candidate) {
                continue;
            }
            members.push(candidate_idx);
            claimed.insert(candidate_idx);
        }
        // Singleton "groups" are not duplicates; drop them.
        if members.len() > 1 {
            duplicate_groups.push(members);
        }
        claimed.insert(anchor_idx);
    }
    duplicate_groups
}
fn are_duplicates(a: &Paper, b: &Paper) -> bool {
if a.source == b.source {
return false;
}
if let (Some(doi_a), Some(doi_b)) = (&a.doi, &b.doi) {
if doi_a.to_lowercase() == doi_b.to_lowercase() {
return true;
}
}
let title_a = a.title.to_lowercase().trim().to_string();
let title_b = b.title.to_lowercase().trim().to_string();
let title_similarity = jaro_winkler(&title_a, &title_b);
if title_similarity >= 0.95 {
if authors_match(a, b) {
return true;
}
}
if normalize_title(&title_a) == normalize_title(&title_b) && authors_match(a, b) {
return true;
}
false
}
/// Returns `true` when the two papers plausibly share authorship.
///
/// Author names are compared case-insensitively after trimming. If either
/// paper lists no authors, the check passes: missing author metadata is
/// treated as inconclusive rather than disqualifying, so DOI/title
/// evidence alone can still establish a duplicate.
fn authors_match(a: &Paper, b: &Paper) -> bool {
    let authors_a: HashSet<String> = a
        .author_list()
        .iter()
        // trim first, then lowercase — avoids an intermediate allocation
        .map(|s| s.trim().to_lowercase())
        .collect();
    let authors_b: HashSet<String> = b
        .author_list()
        .iter()
        .map(|s| s.trim().to_lowercase())
        .collect();
    if authors_a.is_empty() || authors_b.is_empty() {
        return true;
    }
    // `is_disjoint` short-circuits on the first shared name, unlike
    // `intersection().count()`, which materializes every common element.
    !authors_a.is_disjoint(&authors_b)
}
/// Canonicalizes a title for exact comparison: strips every character
/// that is not alphanumeric or whitespace, then collapses whitespace
/// runs into single spaces. Case is left untouched — callers lowercase
/// beforehand when they need case-insensitive matching.
fn normalize_title(title: &str) -> String {
    let mut kept = String::with_capacity(title.len());
    for ch in title.chars() {
        if ch.is_alphanumeric() || ch.is_whitespace() {
            kept.push(ch);
        }
    }
    // split_whitespace drops leading/trailing/repeated whitespace.
    kept.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Removes duplicate papers according to `strategy`.
///
/// Duplicates are detected by the pairwise `find_duplicates` scan. With
/// `Mark`, duplicates are detected but nothing is removed, so the input
/// comes back unchanged. Relative order of survivors is preserved.
pub fn deduplicate_papers(papers: Vec<Paper>, strategy: DuplicateStrategy) -> Vec<Paper> {
    let groups = find_duplicates(&papers);
    if groups.is_empty() {
        return papers;
    }
    let mut drop_set: HashSet<usize> = HashSet::new();
    for group in &groups {
        // Every group from find_duplicates has length >= 2, so the
        // slice bounds below are always valid.
        let doomed: &[usize] = match strategy {
            DuplicateStrategy::First => &group[1..],
            DuplicateStrategy::Last => &group[..group.len() - 1],
            DuplicateStrategy::Mark => &[],
        };
        drop_set.extend(doomed.iter().copied());
    }
    if drop_set.is_empty() {
        return papers;
    }
    papers
        .into_iter()
        .enumerate()
        .filter_map(|(idx, paper)| {
            if drop_set.contains(&idx) {
                None
            } else {
                Some(paper)
            }
        })
        .collect()
}
/// Hash-bucket deduplication: O(n) bucketing instead of the O(n^2)
/// pairwise scan in `deduplicate_papers`.
///
/// Two passes:
/// 1. Indices bucketed by lowercased DOI — collisions are conclusive
///    duplicates and are resolved per `strategy`.
/// 2. Indices bucketed by normalized title — pairs inside a bucket are
///    confirmed with the fuzzy title/author heuristic.
///
/// NOTE(review): unlike the slow path, titles that normalize to
/// *different* strings land in different buckets and are never compared,
/// so this pass can miss fuzzy duplicates `deduplicate_papers` would find.
pub fn fast_deduplicate_papers(papers: Vec<Paper>, strategy: DuplicateStrategy) -> Vec<Paper> {
    if papers.len() <= 1 {
        return papers;
    }
    // Bucket paper indices by lowercased DOI and by normalized title.
    let mut doi_map: HashMap<String, Vec<usize>> = HashMap::new();
    let mut title_map: HashMap<String, Vec<usize>> = HashMap::new();
    for (idx, paper) in papers.iter().enumerate() {
        if let Some(ref doi) = paper.doi {
            let doi_key = doi.to_lowercase();
            doi_map.entry(doi_key).or_default().push(idx);
        }
        let normalized = normalize_title(&paper.title.to_lowercase());
        title_map.entry(normalized).or_default().push(idx);
    }
    let mut duplicates: HashSet<usize> = HashSet::new();
    // Pass 1: identical DOIs. Keep first/last occurrence per strategy.
    for (_, indices) in doi_map.into_iter() {
        if indices.len() > 1 {
            match strategy {
                DuplicateStrategy::First => {
                    for idx in indices.iter().skip(1) {
                        duplicates.insert(*idx);
                    }
                }
                DuplicateStrategy::Last => {
                    for idx in indices.iter().take(indices.len() - 1) {
                        duplicates.insert(*idx);
                    }
                }
                DuplicateStrategy::Mark => {
                    // Mark keeps everything; nothing to remove.
                }
            }
        }
    }
    // Pass 2: equal normalized titles -> confirm with the fuzzy heuristic.
    for (_, indices) in title_map.into_iter() {
        if indices.len() > 1 {
            let mut to_mark: Vec<usize> = Vec::new();
            for i in 0..indices.len() {
                // Skip indices already condemned by the DOI pass.
                if duplicates.contains(&indices[i]) {
                    continue;
                }
                for j in (i + 1)..indices.len() {
                    if duplicates.contains(&indices[j]) {
                        continue;
                    }
                    let paper_i = &papers[indices[i]];
                    let paper_j = &papers[indices[j]];
                    // Same-source pairs are never duplicates (mirrors
                    // `are_duplicates`).
                    if paper_i.source == paper_j.source {
                        continue;
                    }
                    // Identical DOIs were already resolved by pass 1;
                    // skip so the same pair is not handled twice.
                    if let (Some(doi_i), Some(doi_j)) = (&paper_i.doi, &paper_j.doi) {
                        if doi_i.to_lowercase() == doi_j.to_lowercase() {
                            continue;
                        }
                    }
                    if title_similarity_confidence(paper_i, paper_j) {
                        match strategy {
                            DuplicateStrategy::First => to_mark.push(indices[j]),
                            DuplicateStrategy::Last => to_mark.push(indices[i]),
                            DuplicateStrategy::Mark => {}
                        }
                    }
                }
            }
            // Deferred insertion: marks collected in this bucket do not
            // affect comparisons made earlier within the same bucket.
            for idx in to_mark {
                duplicates.insert(idx);
            }
        }
    }
    papers
        .into_iter()
        .enumerate()
        .filter(|(i, _)| !duplicates.contains(i))
        .map(|(_, p)| p)
        .collect()
}
/// Title-based duplicate signal: a fuzzy match (jaro-winkler >= 0.95) or
/// exact normalized-title equality, corroborated by overlapping authors.
fn title_similarity_confidence(a: &Paper, b: &Paper) -> bool {
    let lowered_a = a.title.to_lowercase();
    let lowered_b = b.title.to_lowercase();
    let title_a = lowered_a.trim();
    let title_b = lowered_b.trim();
    // Either title signal alone suffices, but only with author agreement.
    let fuzzy = jaro_winkler(title_a, title_b) >= 0.95;
    let exact = normalize_title(title_a) == normalize_title(title_b);
    (fuzzy || exact) && authors_match(a, b)
}
// Unit tests. Fixtures are built with `crate::models::PaperBuilder`, so
// these exercise the public dedup API end to end on small in-memory data.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::{PaperBuilder, SourceType};

    // Punctuation is stripped and whitespace collapsed; case is preserved.
    #[test]
    fn test_normalize_title() {
        assert_eq!(normalize_title("Hello, World!"), "Hello World");
        assert_eq!(normalize_title("Test Title"), "Test Title");
        assert_eq!(normalize_title("Test: A-B/C"), "Test ABC");
        assert_eq!(normalize_title(""), "");
        assert_eq!(normalize_title(" "), "");
    }

    // Same DOI from two different sources collapses to one paper.
    #[test]
    fn test_deduplicate_by_doi() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }

    // DOI comparison must ignore case.
    #[test]
    fn test_deduplicate_by_doi_case_insensitive() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/TEST")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // No DOI: identical titles + one shared author ("John Doe") suffice.
    #[test]
    fn test_deduplicate_by_title() {
        let papers = vec![
            PaperBuilder::new(
                "1",
                "Machine Learning for Cats",
                "https://arxiv.org/1",
                SourceType::Arxiv,
            )
            .authors("John Doe")
            .build(),
            PaperBuilder::new(
                "2",
                "Machine Learning for Cats",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe; Jane Smith")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // `Last` strategy keeps the later entry of a duplicate pair.
    #[test]
    fn test_deduplicate_keep_last() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::Last);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "2");
    }

    // `Mark` strategy removes nothing even when duplicates exist.
    #[test]
    fn test_deduplicate_mark_strategy() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::Mark);
        assert_eq!(deduped.len(), 2);
    }

    // Same source => never duplicates, even with identical titles.
    #[test]
    fn test_no_duplicates_same_source() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv).build(),
            PaperBuilder::new("2", "Test Paper", "https://arxiv.org/2", SourceType::Arxiv).build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    // Different titles (same author) are not duplicates.
    #[test]
    fn test_no_duplicates_different_titles() {
        let papers = vec![
            PaperBuilder::new("1", "Paper A", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Paper B",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    // Identical titles but disjoint author lists are not duplicates.
    #[test]
    fn test_no_duplicates_no_common_authors() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("Jane Smith")
            .build(),
        ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    // Edge case: empty input passes through untouched.
    #[test]
    fn test_deduplicate_empty_list() {
        let papers = vec![];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 0);
    }

    // Edge case: a single paper can never be a duplicate.
    #[test]
    fn test_deduplicate_single_paper() {
        let papers =
            vec![
                PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                    .build(),
            ];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // Grouping: duplicate pair forms one group; the distinct paper is excluded.
    #[test]
    fn test_find_duplicates() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
            PaperBuilder::new("3", "Other Paper", "https://arxiv.org/3", SourceType::Arxiv).build(),
        ];
        let groups = find_duplicates(&papers);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0], vec![0, 1]);
    }

    // Grouping on an empty slice yields no groups.
    #[test]
    fn test_find_duplicates_empty() {
        let papers = vec![];
        let groups = find_duplicates(&papers);
        assert_eq!(groups.len(), 0);
    }

    // Missing author metadata is inconclusive: title match alone dedupes.
    #[test]
    fn test_authors_match_no_authors() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv).build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .build(),
        ];
        let deduped = deduplicate_papers(papers.clone(), DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // Fast path: DOI collision keeps the first paper.
    #[test]
    fn test_fast_deduplicate_by_doi() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }

    // Fast path: normalized-title bucket + author overlap dedupes.
    #[test]
    fn test_fast_deduplicate_by_title() {
        let papers = vec![
            PaperBuilder::new(
                "1",
                "Machine Learning for Cats",
                "https://arxiv.org/1",
                SourceType::Arxiv,
            )
            .authors("John Doe")
            .build(),
            PaperBuilder::new(
                "2",
                "Machine Learning for Cats",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe; Jane Smith")
            .build(),
        ];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // Fast path honors the `Last` strategy.
    #[test]
    fn test_fast_deduplicate_keep_last() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::Last);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "2");
    }

    // Fast path edge case: empty input.
    #[test]
    fn test_fast_deduplicate_empty() {
        let papers = vec![];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 0);
    }

    // Fast path edge case: single paper short-circuits untouched.
    #[test]
    fn test_fast_deduplicate_single() {
        let papers =
            vec![
                PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                    .build(),
            ];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // Fast path: different titles land in different buckets, no dedup.
    #[test]
    fn test_fast_no_duplicates_different_titles() {
        let papers = vec![
            PaperBuilder::new("1", "Paper A", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Paper B",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe")
            .build(),
        ];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    // Fast path: a three-source DOI collision collapses to the first paper.
    #[test]
    fn test_fast_deduplicate_multiple_sources() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
            PaperBuilder::new(
                "3",
                "Test Paper",
                "https://openalex.org/3",
                SourceType::OpenAlex,
            )
            .doi("10.1234/test")
            .build(),
        ];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }
}