// research_master/utils/dedup.rs
//! Deduplication utilities for papers across sources.

use std::collections::{HashMap, HashSet};
use strsim::jaro_winkler;

use crate::models::Paper;
/// Strategy for handling duplicates
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateStrategy {
    /// Keep the first occurrence of each duplicate group
    First,
    /// Keep the last occurrence of each duplicate group
    Last,
    /// Keep all papers but mark duplicates
    Mark,
}
19/// Find duplicate papers based on DOI, title similarity, and author+year
20///
21/// Returns groups of paper indices that are duplicates of each other
22pub fn find_duplicates(papers: &[Paper]) -> Vec<Vec<usize>> {
23    let mut groups: Vec<Vec<usize>> = Vec::new();
24    let mut processed: HashSet<usize> = HashSet::new();
25
26    for i in 0..papers.len() {
27        if processed.contains(&i) {
28            continue;
29        }
30
31        let mut group = vec![i];
32        let paper_i = &papers[i];
33
34        for (j, paper_j) in papers.iter().enumerate().skip(i + 1) {
35            if processed.contains(&j) {
36                continue;
37            }
38
39            // Check if papers are duplicates
40            if are_duplicates(paper_i, paper_j) {
41                group.push(j);
42                processed.insert(j);
43            }
44        }
45
46        if group.len() > 1 {
47            groups.push(group);
48        }
49
50        processed.insert(i);
51    }
52
53    groups
54}
56/// Check if two papers are likely duplicates
57fn are_duplicates(a: &Paper, b: &Paper) -> bool {
58    // Same source means they're not duplicates
59    if a.source == b.source {
60        return false;
61    }
62
63    // Check DOI match (strongest signal)
64    if let (Some(doi_a), Some(doi_b)) = (&a.doi, &b.doi) {
65        if doi_a.to_lowercase() == doi_b.to_lowercase() {
66            return true;
67        }
68    }
69
70    // Check title similarity
71    let title_a = a.title.to_lowercase().trim().to_string();
72    let title_b = b.title.to_lowercase().trim().to_string();
73
74    let title_similarity = jaro_winkler(&title_a, &title_b);
75
76    // High title similarity (0.95+ threshold)
77    if title_similarity >= 0.95 {
78        // Also check authors match approximately
79        if authors_match(a, b) {
80            return true;
81        }
82    }
83
84    // Check exact title match after cleaning
85    if normalize_title(&title_a) == normalize_title(&title_b) && authors_match(a, b) {
86        return true;
87    }
88
89    false
90}
92/// Check if authors approximately match
93fn authors_match(a: &Paper, b: &Paper) -> bool {
94    let authors_a: HashSet<String> = a
95        .author_list()
96        .iter()
97        .map(|s| s.to_lowercase().trim().to_string())
98        .collect();
99    let authors_b: HashSet<String> = b
100        .author_list()
101        .iter()
102        .map(|s| s.to_lowercase().trim().to_string())
103        .collect();
104
105    // If one has no authors, can't compare
106    if authors_a.is_empty() || authors_b.is_empty() {
107        return true; // Assume match if author info is missing
108    }
109
110    // Check if at least one author matches
111    authors_a.intersection(&authors_b).count() > 0
112}
/// Normalize a title for comparison
fn normalize_title(title: &str) -> String {
    // Drop everything that is neither alphanumeric nor whitespace…
    let mut cleaned = String::with_capacity(title.len());
    for ch in title.chars() {
        if ch.is_alphanumeric() || ch.is_whitespace() {
            cleaned.push(ch);
        }
    }
    // …then collapse runs of whitespace into single spaces.
    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}
125/// Remove duplicate papers from a list
126///
127/// # Arguments
128/// * `papers` - The papers to deduplicate
129/// * `strategy` - How to handle duplicates (keep first, last, or mark)
130///
131/// # Returns
132/// A deduplicated list of papers
133pub fn deduplicate_papers(papers: Vec<Paper>, strategy: DuplicateStrategy) -> Vec<Paper> {
134    let groups = find_duplicates(&papers);
135
136    if groups.is_empty() {
137        return papers;
138    }
139
140    let mut to_remove: HashSet<usize> = HashSet::new();
141
142    for group in groups {
143        match strategy {
144            DuplicateStrategy::First => {
145                // Keep first, remove rest
146                for idx in group.iter().skip(1) {
147                    to_remove.insert(*idx);
148                }
149            }
150            DuplicateStrategy::Last => {
151                // Keep last, remove rest
152                for idx in group.iter().take(group.len() - 1) {
153                    to_remove.insert(*idx);
154                }
155            }
156            DuplicateStrategy::Mark => {
157                // Don't remove any, just return as-is
158                // In a real implementation, you might add a "duplicate" field
159            }
160        }
161    }
162
163    if to_remove.is_empty() {
164        return papers;
165    }
166
167    papers
168        .into_iter()
169        .enumerate()
170        .filter(|(i, _)| !to_remove.contains(i))
171        .map(|(_, p)| p)
172        .collect()
173}
175/// Fast hash-based deduplication for papers
176///
177/// Uses a two-pass algorithm for O(n) complexity on exact matches:
178/// 1. First pass: Hash-based matching for DOIs and normalized titles (O(n))
179/// 2. Second pass: Similarity check only for papers not matched by hash
180///
181/// This is significantly faster than the O(n²) similarity-only approach
182/// for large paper lists with many exact matches.
183pub fn fast_deduplicate_papers(papers: Vec<Paper>, strategy: DuplicateStrategy) -> Vec<Paper> {
184    if papers.len() <= 1 {
185        return papers;
186    }
187
188    // Maps for O(1) lookups
189    let mut doi_map: HashMap<String, Vec<usize>> = HashMap::new();
190    let mut title_map: HashMap<String, Vec<usize>> = HashMap::new();
191
192    // First pass: build hash maps (O(n))
193    for (idx, paper) in papers.iter().enumerate() {
194        // Index by lowercase DOI
195        if let Some(ref doi) = paper.doi {
196            let doi_key = doi.to_lowercase();
197            doi_map.entry(doi_key).or_default().push(idx);
198        }
199
200        // Index by normalized title
201        let normalized = normalize_title(&paper.title.to_lowercase());
202        title_map.entry(normalized).or_default().push(idx);
203    }
204
205    // Track duplicates using a HashSet for O(1) lookups
206    let mut duplicates: HashSet<usize> = HashSet::new();
207
208    // Process DOI matches (strongest signal) - O(n)
209    for (_, indices) in doi_map.into_iter() {
210        if indices.len() > 1 {
211            match strategy {
212                DuplicateStrategy::First => {
213                    for idx in indices.iter().skip(1) {
214                        duplicates.insert(*idx);
215                    }
216                }
217                DuplicateStrategy::Last => {
218                    for idx in indices.iter().take(indices.len() - 1) {
219                        duplicates.insert(*idx);
220                    }
221                }
222                DuplicateStrategy::Mark => {
223                    // Keep all
224                }
225            }
226        }
227    }
228
229    // Process title matches for papers not already marked as DOI duplicates
230    // Only compare papers from different sources to avoid false positives
231    for (_, indices) in title_map.into_iter() {
232        if indices.len() > 1 {
233            let mut to_mark: Vec<usize> = Vec::new();
234
235            // Check each pair
236            for i in 0..indices.len() {
237                if duplicates.contains(&indices[i]) {
238                    continue;
239                }
240
241                for j in (i + 1)..indices.len() {
242                    if duplicates.contains(&indices[j]) {
243                        continue;
244                    }
245
246                    let paper_i = &papers[indices[i]];
247                    let paper_j = &papers[indices[j]];
248
249                    // Skip same source
250                    if paper_i.source == paper_j.source {
251                        continue;
252                    }
253
254                    // Check if already marked as DOI duplicate
255                    if let (Some(doi_i), Some(doi_j)) = (&paper_i.doi, &paper_j.doi) {
256                        if doi_i.to_lowercase() == doi_j.to_lowercase() {
257                            continue; // Already handled by DOI matching
258                        }
259                    }
260
261                    // Additional similarity check for confidence
262                    if title_similarity_confidence(paper_i, paper_j) {
263                        match strategy {
264                            DuplicateStrategy::First => to_mark.push(indices[j]),
265                            DuplicateStrategy::Last => to_mark.push(indices[i]),
266                            DuplicateStrategy::Mark => {}
267                        }
268                    }
269                }
270            }
271
272            // Add to duplicates
273            for idx in to_mark {
274                duplicates.insert(idx);
275            }
276        }
277    }
278
279    // Filter papers
280    papers
281        .into_iter()
282        .enumerate()
283        .filter(|(i, _)| !duplicates.contains(i))
284        .map(|(_, p)| p)
285        .collect()
286}
288/// Calculate confidence that two papers are the same based on multiple signals
289fn title_similarity_confidence(a: &Paper, b: &Paper) -> bool {
290    // High title similarity
291    let title_a = a.title.to_lowercase().trim().to_string();
292    let title_b = b.title.to_lowercase().trim().to_string();
293    let similarity = jaro_winkler(&title_a, &title_b);
294
295    if similarity >= 0.95 && authors_match(a, b) {
296        return true;
297    }
298
299    // Exact normalized title match with author overlap
300    if normalize_title(&title_a) == normalize_title(&title_b) && authors_match(a, b) {
301        return true;
302    }
303
304    false
305}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::{PaperBuilder, SourceType};

    #[test]
    fn test_normalize_title() {
        assert_eq!(normalize_title("Hello, World!"), "Hello World");
        assert_eq!(normalize_title("Test   Title"), "Test Title");
        assert_eq!(normalize_title("Test: A-B/C"), "Test ABC");
        assert_eq!(normalize_title(""), "");
        assert_eq!(normalize_title("   "), "");
    }

    #[test]
    fn test_deduplicate_by_doi() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }

    #[test]
    fn test_deduplicate_by_doi_case_insensitive() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/TEST")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_deduplicate_by_title() {
        let papers = vec![
            PaperBuilder::new(
                "1",
                "Machine Learning for Cats",
                "https://arxiv.org/1",
                SourceType::Arxiv,
            )
            .authors("John Doe")
            .build(),
            PaperBuilder::new(
                "2",
                "Machine Learning for Cats",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe; Jane Smith")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_deduplicate_keep_last() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::Last);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "2");
    }

    #[test]
    fn test_deduplicate_mark_strategy() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::Mark);
        // Mark strategy should keep all papers
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_no_duplicates_same_source() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv).build(),
            PaperBuilder::new("2", "Test Paper", "https://arxiv.org/2", SourceType::Arxiv).build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_no_duplicates_different_titles() {
        let papers = vec![
            PaperBuilder::new("1", "Paper A", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Paper B",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_no_duplicates_no_common_authors() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("Jane Smith")
            .build(),
        ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_deduplicate_empty_list() {
        let papers = vec![];
        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 0);
    }

    #[test]
    fn test_deduplicate_single_paper() {
        let papers =
            vec![
                PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                    .build(),
            ];

        let deduped = deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_find_duplicates() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
            PaperBuilder::new("3", "Other Paper", "https://arxiv.org/3", SourceType::Arxiv).build(),
        ];

        let groups = find_duplicates(&papers);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0], vec![0, 1]);
    }

    #[test]
    fn test_find_duplicates_empty() {
        let papers = vec![];
        let groups = find_duplicates(&papers);
        assert_eq!(groups.len(), 0);
    }

    #[test]
    fn test_authors_match_no_authors() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv).build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .build(),
        ];

        let deduped = deduplicate_papers(papers.clone(), DuplicateStrategy::First);
        // Without authors, should still match on title
        assert_eq!(deduped.len(), 1);
    }

    // Tests for fast_deduplicate_papers

    #[test]
    fn test_fast_deduplicate_by_doi() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }

    #[test]
    fn test_fast_deduplicate_by_title() {
        let papers = vec![
            PaperBuilder::new(
                "1",
                "Machine Learning for Cats",
                "https://arxiv.org/1",
                SourceType::Arxiv,
            )
            .authors("John Doe")
            .build(),
            PaperBuilder::new(
                "2",
                "Machine Learning for Cats",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe; Jane Smith")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_fast_deduplicate_keep_last() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::Last);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "2");
    }

    #[test]
    fn test_fast_deduplicate_empty() {
        let papers = vec![];
        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 0);
    }

    #[test]
    fn test_fast_deduplicate_single() {
        let papers =
            vec![
                PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                    .build(),
            ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
    }

    #[test]
    fn test_fast_no_duplicates_different_titles() {
        let papers = vec![
            PaperBuilder::new("1", "Paper A", "https://arxiv.org/1", SourceType::Arxiv)
                .authors("John Doe")
                .build(),
            PaperBuilder::new(
                "2",
                "Paper B",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .authors("John Doe")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_fast_deduplicate_multiple_sources() {
        let papers = vec![
            PaperBuilder::new("1", "Test Paper", "https://arxiv.org/1", SourceType::Arxiv)
                .doi("10.1234/test")
                .build(),
            PaperBuilder::new(
                "2",
                "Test Paper",
                "https://semantic.org/2",
                SourceType::SemanticScholar,
            )
            .doi("10.1234/test")
            .build(),
            PaperBuilder::new(
                "3",
                "Test Paper",
                "https://openalex.org/3",
                SourceType::OpenAlex,
            )
            .doi("10.1234/test")
            .build(),
        ];

        let deduped = fast_deduplicate_papers(papers, DuplicateStrategy::First);
        assert_eq!(deduped.len(), 1);
        assert_eq!(deduped[0].paper_id, "1");
    }
}