1use std::collections::{HashMap, HashSet};
4use strsim::jaro_winkler;
5
6use crate::models::Paper;
7
/// Policy for resolving a group of papers identified as duplicates.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateStrategy {
    /// Keep the first occurrence in each duplicate group, drop the rest.
    First,
    /// Keep the last occurrence in each duplicate group, drop the rest.
    Last,
    /// Identify duplicates but remove nothing (callers flag them separately).
    Mark,
}
18
19pub fn find_duplicates(papers: &[Paper]) -> Vec<Vec<usize>> {
23 let mut groups: Vec<Vec<usize>> = Vec::new();
24 let mut processed: HashSet<usize> = HashSet::new();
25
26 for i in 0..papers.len() {
27 if processed.contains(&i) {
28 continue;
29 }
30
31 let mut group = vec![i];
32 let paper_i = &papers[i];
33
34 for (j, paper_j) in papers.iter().enumerate().skip(i + 1) {
35 if processed.contains(&j) {
36 continue;
37 }
38
39 if are_duplicates(paper_i, paper_j) {
41 group.push(j);
42 processed.insert(j);
43 }
44 }
45
46 if group.len() > 1 {
47 groups.push(group);
48 }
49
50 processed.insert(i);
51 }
52
53 groups
54}
55
56fn are_duplicates(a: &Paper, b: &Paper) -> bool {
58 if a.source == b.source {
60 return false;
61 }
62
63 if let (Some(doi_a), Some(doi_b)) = (&a.doi, &b.doi) {
65 if doi_a.to_lowercase() == doi_b.to_lowercase() {
66 return true;
67 }
68 }
69
70 let title_a = a.title.to_lowercase().trim().to_string();
72 let title_b = b.title.to_lowercase().trim().to_string();
73
74 let title_similarity = jaro_winkler(&title_a, &title_b);
75
76 if title_similarity >= 0.95 {
78 if authors_match(a, b) {
80 return true;
81 }
82 }
83
84 if normalize_title(&title_a) == normalize_title(&title_b) && authors_match(a, b) {
86 return true;
87 }
88
89 false
90}
91
92fn authors_match(a: &Paper, b: &Paper) -> bool {
94 let authors_a: HashSet<String> = a
95 .author_list()
96 .iter()
97 .map(|s| s.to_lowercase().trim().to_string())
98 .collect();
99 let authors_b: HashSet<String> = b
100 .author_list()
101 .iter()
102 .map(|s| s.to_lowercase().trim().to_string())
103 .collect();
104
105 if authors_a.is_empty() || authors_b.is_empty() {
107 return true; }
109
110 authors_a.intersection(&authors_b).count() > 0
112}
113
/// Strips punctuation (everything that is neither alphanumeric nor
/// whitespace) and collapses whitespace runs to single spaces.
///
/// Case is preserved; callers lowercase the input first when they need a
/// case-insensitive key.
fn normalize_title(title: &str) -> String {
    // Keep only letters, digits, and whitespace.
    let mut kept = String::with_capacity(title.len());
    for ch in title.chars() {
        if ch.is_alphanumeric() || ch.is_whitespace() {
            kept.push(ch);
        }
    }

    // Re-join on single spaces, dropping leading/trailing/repeated whitespace.
    kept.split_whitespace().collect::<Vec<_>>().join(" ")
}
124
125pub fn deduplicate_papers(papers: Vec<Paper>, strategy: DuplicateStrategy) -> Vec<Paper> {
134 let groups = find_duplicates(&papers);
135
136 if groups.is_empty() {
137 return papers;
138 }
139
140 let mut to_remove: HashSet<usize> = HashSet::new();
141
142 for group in groups {
143 match strategy {
144 DuplicateStrategy::First => {
145 for idx in group.iter().skip(1) {
147 to_remove.insert(*idx);
148 }
149 }
150 DuplicateStrategy::Last => {
151 for idx in group.iter().take(group.len() - 1) {
153 to_remove.insert(*idx);
154 }
155 }
156 DuplicateStrategy::Mark => {
157 }
160 }
161 }
162
163 if to_remove.is_empty() {
164 return papers;
165 }
166
167 papers
168 .into_iter()
169 .enumerate()
170 .filter(|(i, _)| !to_remove.contains(i))
171 .map(|(_, p)| p)
172 .collect()
173}
174
/// Hash-based deduplication: buckets papers by DOI and normalized title so
/// that pairwise comparison only happens inside small same-title buckets,
/// instead of the full O(n²) scan in `deduplicate_papers`.
///
/// `First` keeps the earliest paper of each duplicate set, `Last` the
/// latest, `Mark` removes nothing. Order of surviving papers is preserved.
///
/// NOTE(review): unlike `are_duplicates`, the DOI pass below does not skip
/// papers from the same source, so two same-source papers with equal DOIs
/// are removed here but would be kept by `deduplicate_papers` — confirm
/// this divergence between the fast and exact paths is intended.
pub fn fast_deduplicate_papers(papers: Vec<Paper>, strategy: DuplicateStrategy) -> Vec<Paper> {
    if papers.len() <= 1 {
        return papers;
    }

    // Bucket paper indices by lowercased DOI and by normalized title.
    let mut doi_map: HashMap<String, Vec<usize>> = HashMap::new();
    let mut title_map: HashMap<String, Vec<usize>> = HashMap::new();

    for (idx, paper) in papers.iter().enumerate() {
        if let Some(ref doi) = paper.doi {
            let doi_key = doi.to_lowercase();
            doi_map.entry(doi_key).or_default().push(idx);
        }

        let normalized = normalize_title(&paper.title.to_lowercase());
        title_map.entry(normalized).or_default().push(idx);
    }

    let mut duplicates: HashSet<usize> = HashSet::new();

    // Pass 1: identical DOIs are conclusive — resolve each bucket directly.
    for (_, indices) in doi_map.into_iter() {
        if indices.len() > 1 {
            match strategy {
                DuplicateStrategy::First => {
                    // Keep the first index, mark the rest for removal.
                    for idx in indices.iter().skip(1) {
                        duplicates.insert(*idx);
                    }
                }
                DuplicateStrategy::Last => {
                    // Keep the last index, mark the rest for removal.
                    for idx in indices.iter().take(indices.len() - 1) {
                        duplicates.insert(*idx);
                    }
                }
                DuplicateStrategy::Mark => {
                }
            }
        }
    }

    // Pass 2: same normalized title — confirm pairwise within the bucket,
    // skipping anything already removed, same-source pairs, and pairs
    // already settled by the DOI pass.
    for (_, indices) in title_map.into_iter() {
        if indices.len() > 1 {
            let mut to_mark: Vec<usize> = Vec::new();

            for i in 0..indices.len() {
                if duplicates.contains(&indices[i]) {
                    continue;
                }

                for j in (i + 1)..indices.len() {
                    if duplicates.contains(&indices[j]) {
                        continue;
                    }

                    let paper_i = &papers[indices[i]];
                    let paper_j = &papers[indices[j]];

                    // Same source: treated as distinct papers (see are_duplicates).
                    if paper_i.source == paper_j.source {
                        continue;
                    }

                    // Equal DOIs were already handled (or Mark'ed) in pass 1.
                    if let (Some(doi_i), Some(doi_j)) = (&paper_i.doi, &paper_j.doi) {
                        if doi_i.to_lowercase() == doi_j.to_lowercase() {
                            continue; }
                    }

                    if title_similarity_confidence(paper_i, paper_j) {
                        match strategy {
                            DuplicateStrategy::First => to_mark.push(indices[j]),
                            DuplicateStrategy::Last => to_mark.push(indices[i]),
                            DuplicateStrategy::Mark => {}
                        }
                    }
                }
            }

            // Deferred so in-flight pairwise checks in this bucket see a
            // stable `duplicates` set.
            for idx in to_mark {
                duplicates.insert(idx);
            }
        }
    }

    papers
        .into_iter()
        .enumerate()
        .filter(|(i, _)| !duplicates.contains(i))
        .map(|(_, p)| p)
        .collect()
}
287
288fn title_similarity_confidence(a: &Paper, b: &Paper) -> bool {
290 let title_a = a.title.to_lowercase().trim().to_string();
292 let title_b = b.title.to_lowercase().trim().to_string();
293 let similarity = jaro_winkler(&title_a, &title_b);
294
295 if similarity >= 0.95 && authors_match(a, b) {
296 return true;
297 }
298
299 if normalize_title(&title_a) == normalize_title(&title_b) && authors_match(a, b) {
301 return true;
302 }
303
304 false
305}
306
#[cfg(test)]
mod tests {
    //! Unit tests covering title normalization, the exact (`deduplicate_papers`)
    //! path, and the hash-based (`fast_deduplicate_papers`) path.

    use super::*;
    use crate::models::{PaperBuilder, SourceType};

    #[test]
    fn test_normalize_title() {
        assert_eq!(normalize_title("Hello, World!"), "Hello World");
        assert_eq!(normalize_title("Test Title"), "Test Title");
        assert_eq!(normalize_title("Test: A-B/C"), "Test ABC");
        assert_eq!(normalize_title(""), "");
        assert_eq!(normalize_title(" "), "");
    }

    // Matching DOIs across different sources collapse to one paper.
    #[test]
    fn test_deduplicate_by_doi() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }

    // DOI comparison ignores case.
    #[test]
    fn test_deduplicate_by_doi_case_insensitive() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/TEST")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // Identical titles with overlapping authors count as duplicates
    // even without DOIs.
    #[test]
    fn test_deduplicate_by_title() {
        let papers = vec![
            PaperBuilder::new(
                "1",
                "Machine Learning for Cats",
                "https://arxiv.org/1",
                SourceType::Arxiv,
            )
            .authors("John Doe")
            .build(),
            PaperBuilder::new(
                "2",
                "Machine Learning for Cats",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe; Jane Smith")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // Last strategy keeps the later occurrence of the duplicate group.
    #[test]
    fn test_deduplicate_keep_last() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::Last);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "2");
    }

    // Mark strategy removes nothing.
    #[test]
    fn test_deduplicate_mark_strategy() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::Mark);
        assert_eq!(deduped.len(), 2);
    }

    // Papers from the same source are never duplicates of each other.
    #[test]
    fn test_no_duplicates_same_source() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv).build(),
            PaperBuilder::new("2", "Test Paper", "https://arxiv.org/2", SourceType::Arxiv).build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_no_duplicates_different_titles() {
        let papers = vec![
            PaperBuilder::new("1", "Paper A", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Paper B",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    // Same title but disjoint author lists: not a duplicate.
    #[test]
    fn test_no_duplicates_no_common_authors() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("Jane Smith")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_deduplicate_empty_list() {
        let papers = vec![];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 0);
    }

    #[test]
    fn test_deduplicate_single_paper() {
        let papers =
            vec![
                PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                    .build(),
            ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_find_duplicates() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
            PaperBuilder::new("3", "Other Paper", "https://arxiv.org/3", SourceType::Arxiv).build(),
        ];

        let groups = find_duplicates(&papers);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0], vec![0, 1]);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let papers = vec![];
        let groups = find_duplicates(&papers);
        assert_eq!(groups.len(), 0);
    }

    // With no author data on either side, author matching is permissive,
    // so identical titles across sources still deduplicate.
    #[test]
    fn test_authors_match_no_authors() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv).build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .build(),
        ];

        let deduped = deduplicate_papers(papers.clone(), DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    // ---- fast path (hash-based) ----

    #[test]
    fn test_fast_deduplicate_by_doi() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }

    #[test]
    fn test_fast_deduplicate_by_title() {
        let papers = vec![
            PaperBuilder::new(
                "1",
                "Machine Learning for Cats",
                "https://arxiv.org/1",
                SourceType::Arxiv,
            )
            .authors("John Doe")
            .build(),
            PaperBuilder::new(
                "2",
                "Machine Learning for Cats",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe; Jane Smith")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_fast_deduplicate_keep_last() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::Last);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "2");
    }

    #[test]
    fn test_fast_deduplicate_empty() {
        let papers = vec![];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 0);
    }

    #[test]
    fn test_fast_deduplicate_single() {
        let papers =
            vec![
                PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                    .build(),
            ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_fast_no_duplicates_different_titles() {
        let papers = vec![
            PaperBuilder::new("1", "Paper A", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Paper B",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    // Three sources sharing one DOI collapse to the first occurrence.
    #[test]
    fn test_fast_deduplicate_multiple_sources() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
            PaperBuilder::new(
                "3",
                "Test Paper",
                "https://openalex.org/3",
                SourceType::OpenAlex,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }
}