biblib/
dedupe.rs

1//! Citations deduplicator implementation.
2//!
3//! A module for detecting duplicate academic citations. It provides robust
4//! deduplication of citations based on multiple criteria including DOIs, titles, journal names,
5//! and other metadata.
6//!
7//! ## Features
8//!
9//! - Flexible deduplication based on multiple citation fields
10//! - Smart matching of journal names and abbreviations
11//! - Support for DOI and non-DOI based citations
12//! - Optional year-based grouping for improved performance
13//! - Parallel processing support
14//! - Unicode character handling
15//! - Configurable matching thresholds
16//! - Source-aware deduplication with preferences
17//!
18//! ## Usage
19//!
20//! ### Basic Deduplication
21//!
22//! ```rust
23//! use biblib::{dedupe::Deduplicator, Citation, Author, Date};
24//!
25//! // Create some sample citations
26//! let citations = vec![
27//!     Citation {
28//!         title: "Machine Learning Basics".to_string(),
29//!         authors: vec![
30//!             Author {
31//!                 name: "Smith".to_string(),
32//!                 given_name: Some("John".to_string()),
33//!                 middle_name: None,
34//!                 affiliations: vec![],
35//!             }
36//!         ],
37//!         doi: Some("10.1234/ml.2023.001".to_string()),
38//!         date: Some(Date { year: 2023, month: None, day: None }),
39//!         ..Default::default()
40//!     },
41//!     // Duplicate citation with slightly different title
42//!     Citation {
43//!         title: "Machine Learning Basics.".to_string(), // Notice the period
44//!         authors: vec![
45//!             Author {
46//!                 name: "Smith".to_string(),
47//!                 given_name: Some("John".to_string()),
48//!                 middle_name: None,
49//!                 affiliations: vec![],
50//!             }
51//!         ],
52//!         doi: Some("10.1234/ml.2023.001".to_string()),
53//!         date: Some(Date { year: 2023, month: None, day: None }),
54//!         ..Default::default()
55//!     },
56//! ];
57//!
58//! // Create a deduplicator with default settings
59//! let deduplicator = Deduplicator::new();
60//!
61//! // Find duplicate citations
62//! let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
63//!
64//! // Process results
65//! for group in duplicate_groups {
66//!     println!("Original: {}", group.unique.title);
67//!     for duplicate in group.duplicates {
68//!         println!("  Duplicate: {}", duplicate.title);
69//!     }
70//! }
71//! ```
72//!
73//! ### Deduplication with Source Preferences
74//!
75//! ```rust
76//! use biblib::{dedupe::Deduplicator, Citation};
77//!
78//! let citations = vec![
79//!     Citation {
80//!         title: "Example Title".to_string(),
81//!         doi: Some("10.1234/example".to_string()),
82//!         ..Default::default()
83//!     },
84//!     Citation {
85//!         title: "Example Title".to_string(),
86//!         doi: Some("10.1234/example".to_string()),
87//!         ..Default::default()
88//!     },
89//! ];
90//!
91//! // Sources corresponding to each citation
92//! let sources = vec!["Embase", "PubMed"];
93//!
94//! let config = biblib::dedupe::DeduplicatorConfig {
95//!     source_preferences: vec!["PubMed".to_string(), "Embase".to_string()],
96//!     ..Default::default()
97//! };
98//!
99//! let deduplicator = Deduplicator::new().with_config(config);
100//! let duplicate_groups = deduplicator.find_duplicates_with_sources(&citations, &sources).unwrap();
101//!
102//! // The PubMed citation will be selected as the unique citation
103//! ```
104//!
105//! ## Advanced Configuration
106//!
107//! The deduplicator can be configured with custom settings:
108//!
109//! ```rust
110//! use biblib::dedupe::{Deduplicator, DeduplicatorConfig};
111//!
112//! let config = DeduplicatorConfig {
113//!     group_by_year: false,     // Disable year-based grouping
114//!     run_in_parallel: true,    // Enable parallel processing
115//!     source_preferences: vec!["PubMed".to_string(), "CrossRef".to_string()],
116//! };
117//!
118//! let deduplicator = Deduplicator::new().with_config(config);
119//! ```
120//!
121//! ## Matching Criteria
122//!
123//! Citations are considered duplicates based on the following criteria:
124//!
125//! 1. With DOIs:
126//!    - Matching DOIs and high title similarity (≥ 0.85)
127//!    - Matching journal names or ISSNs
128//!
129//! 2. Without DOIs:
130//!    - Very high title similarity (≥ 0.93)
131//!    - Matching volume or page numbers
132//!    - Matching journal names or ISSNs
133
134use crate::regex::Regex;
135use crate::{Citation, DuplicateGroup};
136use std::collections::HashMap;
137use std::sync::LazyLock;
138use strsim::jaro;
139use strsim::jaro_winkler;
140
/// Minimum Jaro similarity between normalized titles for citations that share a DOI.
const DOI_TITLE_SIMILARITY_THRESHOLD: f64 = 0.85;
/// Minimum Jaro-Winkler similarity between normalized titles for citations without DOIs.
const NO_DOI_TITLE_SIMILARITY_THRESHOLD: f64 = 0.93;

/// Matches `<U+XXXX>` escape sequences (hex code point in angle brackets),
/// as emitted by some citation exporters; decoded in `convert_unicode_string`.
static UNICODE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<U\+([0-9A-Fa-f]+)>").unwrap());

/// Substring replacements applied while normalizing titles: HTML entities and
/// sub-/superscript markup are stripped, and Greek letters (spelled out or as
/// characters) are folded to single Latin letters so variant spellings agree.
// NOTE(review): "beta"/"alpha" are plain substring replacements and also fire
// inside longer words (e.g. "alphabet" -> "abet") — presumably acceptable for
// fuzzy title matching, but worth confirming.
const HTML_REPLACEMENTS: [(&str, &str); 13] = [
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("<sup>", ""),
    ("</sup>", ""),
    ("<sub>", ""),
    ("</sub>", ""),
    ("<inf>", ""),
    ("</inf>", ""),
    ("beta", "b"),
    ("alpha", "a"),
    ("α", "a"),
    ("ß", "b"),
    ("γ", "g"),
];
162
/// Configuration options for controlling the deduplication process.
///
/// This struct allows fine-tuning of the deduplication algorithm's behavior
/// through various options. The main settings control grouping strategy and
/// parallel processing capabilities.
///
/// # Examples
///
/// ```
/// use biblib::dedupe::DeduplicatorConfig;
///
/// let config = DeduplicatorConfig {
///     group_by_year: true,    // Enable year-based grouping
///     run_in_parallel: true,  // Enable parallel processing
///     source_preferences: vec!["PubMed".to_string(), "Google Scholar".to_string()],
/// };
/// ```
///
/// # Performance Impact
///
/// - `group_by_year`: Significant performance improvement for large datasets
/// - `run_in_parallel`: Most effective when used with year grouping
///
/// # Notes
///
/// - When `group_by_year` is false, `run_in_parallel` is automatically disabled
/// - Year grouping is recommended for datasets with > 1000 citations
#[derive(Debug, Default, Clone)]
pub struct DeduplicatorConfig {
    /// Whether to group citations by year before processing.
    /// This can significantly improve performance for large datasets.
    pub group_by_year: bool,
    /// Whether to use parallel processing for year groups.
    /// Most effective when combined with `group_by_year = true`.
    pub run_in_parallel: bool,
    /// Ordered list of preferred sources for unique citations.
    /// First source in the list has highest priority.
    pub source_preferences: Vec<String>,
}
202
/// Core deduplication engine for finding duplicate citations.
///
/// The deduplicator uses a sophisticated algorithm to identify duplicate citations
/// based on multiple criteria including DOIs, titles, and other metadata. It supports
/// both exact and fuzzy matching with configurable thresholds.
///
/// # Algorithm
///
/// Citations are considered duplicates based on these criteria:
///
/// 1. **With DOIs**:
///    - Matching DOIs and high title similarity (≥ 0.85)
///    - Matching journal names or ISSNs
///
/// 2. **Without DOIs**:
///    - Very high title similarity (≥ 0.93)
///    - Matching volume/pages
///    - Matching journal names/ISSNs
///
/// # Examples
///
/// ```
/// use biblib::dedupe::{Deduplicator, DeduplicatorConfig};
///
/// // Create with default settings
/// let deduplicator = Deduplicator::new();
///
/// // Or with custom configuration
/// let config = DeduplicatorConfig {
///     group_by_year: true,
///     run_in_parallel: true,
///     source_preferences: vec!["PubMed".to_string(), "Embase".to_string()],
/// };
/// let deduplicator = Deduplicator::new().with_config(config);
/// ```
///
/// # Performance
///
/// - Time complexity: O(n²) without year grouping
/// - With year grouping: O(Σ n_y²) where n_y is citations per year
/// - Parallel processing available when using year grouping
#[derive(Debug, Default, Clone)]
pub struct Deduplicator {
    /// Active configuration controlling grouping, parallelism, and source preferences.
    config: DeduplicatorConfig,
}
247
/// A citation bundled with its pre-normalized comparison fields, computed
/// once per citation so pairwise matching avoids repeated normalization.
#[derive(Debug)]
struct PreprocessedCitation<'a> {
    /// The underlying citation being compared.
    original: &'a Citation,
    /// Title after unicode-escape decoding and `normalize_string`.
    normalized_title: String,
    /// Journal name via `format_journal_name`, if present.
    normalized_journal: Option<String>,
    /// Journal abbreviation via `format_journal_name`, if present.
    normalized_journal_abbr: Option<String>,
    /// All ISSNs that parsed into canonical `NNNN-NNNN` form.
    normalized_issn: Vec<String>,
    /// First digit run found in the volume string; empty when absent.
    normalized_volume: String,
}
257
/// Error types for dedupe operations
#[derive(Debug, thiserror::Error)]
pub enum DedupeError {
    /// A citation's data was invalid.
    // NOTE(review): not constructed anywhere in this module — verify callers
    // elsewhere before removing.
    #[error("Invalid citation data: {0}")]
    InvalidCitation(String),

    /// A citation failed normalization (e.g. its title is empty).
    #[error("Processing error: {0}")]
    ProcessingError(String),

    /// The inputs were inconsistent, e.g. more sources than citations.
    #[error("Configuration error: {0}")]
    ConfigError(String),
}
270
271impl Deduplicator {
272    /// Creates a new Deduplicator with default configuration.
273    ///
274    /// Default configuration enables year-based grouping and disables parallel processing.
275    ///
276    /// # Examples
277    ///
278    /// ```
279    /// use biblib::dedupe::Deduplicator;
280    ///
281    /// let deduplicator = Deduplicator::new();
282    /// ```
283    #[must_use]
284    pub fn new() -> Self {
285        Self {
286            config: DeduplicatorConfig {
287                group_by_year: true,
288                run_in_parallel: false,
289                source_preferences: Vec::new(),
290            },
291        }
292    }
293
294    /// Creates a new Deduplicator with custom configuration.
295    ///
296    /// # Notes
297    ///
298    /// - Disabling year-based grouping can result in very long processing times.
299    /// - Parallel processing (`run_in_parallel`) is only effective when `group_by_year` is `true`.
300    /// - If `run_in_parallel` is `true` but `group_by_year` is `false`, `run_in_parallel` will be ignored.
301    ///
302    /// # Examples
303    ///
304    /// ```
305    /// use biblib::dedupe::{Deduplicator, DeduplicatorConfig};
306    ///
307    /// let config = DeduplicatorConfig {
308    ///     group_by_year: true,
309    ///     run_in_parallel: true,
310    ///     source_preferences: vec!["PubMed".to_string(), "Google Scholar".to_string()],   
311    /// };
312    /// let deduplicator = Deduplicator::new().with_config(config);
313    /// ```
314    #[must_use]
315    pub fn with_config(mut self, mut config: DeduplicatorConfig) -> Self {
316        // Disable parallel processing if not grouping by year
317        if !config.group_by_year {
318            config.run_in_parallel = false;
319        }
320        self.config = config;
321        self
322    }
323
    /// Processes a list of citations and returns groups of duplicates.
    ///
    /// This method analyzes the provided citations and groups them based on
    /// similarity criteria including DOIs, titles, and other metadata.
    /// One citation in each group is designated as the unique (original) citation.
    ///
    /// # Arguments
    ///
    /// * `citations` - A slice of Citation objects to be analyzed
    ///
    /// # Returns
    ///
    /// Returns a vector of `DuplicateGroup`s, where each group contains
    /// one unique citation and its identified duplicates.
    ///
    /// # Errors
    ///
    /// Returns [`DedupeError::ProcessingError`] if a citation's title fails to
    /// normalize (i.e. the title is empty).
    ///
    /// # Examples
    ///
    /// ```
    /// use biblib::{dedupe::Deduplicator, Citation};
    ///
    /// let citations = vec![
    ///     Citation {
    ///         title: "Example Title".to_string(),
    ///         doi: Some("10.1234/example".to_string()),
    ///         ..Default::default()
    ///     },
    ///     // ... more citations ...
    /// ];
    ///
    /// let deduplicator = Deduplicator::new();
    /// let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
    /// ```
    pub fn find_duplicates(
        self,
        citations: &[Citation],
    ) -> Result<Vec<DuplicateGroup>, DedupeError> {
        // Delegates to the source-aware variant with no source information.
        self.find_duplicates_with_sources(citations, &[])
    }
362
363    /// Processes citations with their source information and returns groups of duplicates.
364    ///
365    /// This method is similar to `find_duplicates` but allows you to specify source
366    /// information for each citation, enabling source-based preferences during deduplication.
367    /// Citations without corresponding source entries are treated as having no source.
368    ///
369    /// # Arguments
370    ///
371    /// * `citations` - A slice of Citation objects to be analyzed
372    /// * `sources` - A slice of source names corresponding to each citation.
373    ///   If shorter than citations, remaining citations have no source.
374    ///
375    /// # Returns
376    ///
377    /// Returns a vector of `DuplicateGroup`s, where each group contains
378    /// one unique citation and its identified duplicates.
379    ///
380    /// # Examples
381    ///
382    /// ```
383    /// use biblib::{dedupe::Deduplicator, Citation};
384    ///
385    /// let citations = vec![
386    ///     Citation {
387    ///         title: "Example Title".to_string(),
388    ///         doi: Some("10.1234/example".to_string()),
389    ///         ..Default::default()
390    ///     },
391    ///     Citation {
392    ///         title: "Example Title".to_string(),
393    ///         doi: Some("10.1234/example".to_string()),
394    ///         ..Default::default()
395    ///     },
396    /// ];
397    ///
398    /// let sources = vec!["PubMed", "CrossRef"];
399    ///
400    /// let deduplicator = Deduplicator::new();
401    /// let duplicate_groups = deduplicator.find_duplicates_with_sources(&citations, &sources).unwrap();
402    /// ```
403    pub fn find_duplicates_with_sources(
404        self,
405        citations: &[Citation],
406        sources: &[&str],
407    ) -> Result<Vec<DuplicateGroup>, DedupeError> {
408        if citations.is_empty() {
409            return Ok(Vec::new());
410        }
411
412        // Validate input - warn if sources length exceeds citations
413        if sources.len() > citations.len() {
414            return Err(DedupeError::ConfigError(format!(
415                "Number of sources ({}) exceeds number of citations ({}). Each source must correspond to a citation.",
416                sources.len(),
417                citations.len()
418            )));
419        }
420
421        // Create source mapping using citation indices instead of IDs
422        let source_map: HashMap<usize, Option<&str>> = citations
423            .iter()
424            .enumerate()
425            .zip(
426                sources
427                    .iter()
428                    .map(|&s| Some(s))
429                    .chain(std::iter::repeat(None)),
430            )
431            .map(|((idx, _citation), source)| (idx, source))
432            .collect();
433
434        // Create global mapping from citation pointers to original indices
435        let global_ptr_to_index: HashMap<*const Citation, usize> = citations
436            .iter()
437            .enumerate()
438            .map(|(i, citation)| (citation as *const Citation, i))
439            .collect();
440
441        if self.config.group_by_year {
442            let year_groups = Self::group_by_year_with_indices(citations);
443            if self.config.run_in_parallel {
444                use rayon::prelude::*;
445
446                let duplicate_groups: Result<Vec<_>, _> = year_groups
447                    .par_iter()
448                    .map(|(_, citations_with_indices)| {
449                        let citations_in_year: Vec<&Citation> = citations_with_indices
450                            .iter()
451                            .map(|(citation, _)| *citation)
452                            .collect();
453                        // Create a local mapping for this year group
454                        let local_to_global: HashMap<*const Citation, usize> =
455                            citations_with_indices
456                                .iter()
457                                .map(|(citation, global_idx)| {
458                                    (*citation as *const Citation, *global_idx)
459                                })
460                                .collect();
461                        self.process_citation_group_with_sources(
462                            &citations_in_year,
463                            &source_map,
464                            &local_to_global,
465                        )
466                    })
467                    .collect();
468
469                // Flatten results
470                Ok(duplicate_groups?.into_iter().flatten().collect())
471            } else {
472                let mut duplicate_groups = Vec::new();
473
474                for citations_with_indices in year_groups.values() {
475                    let citations_in_year: Vec<&Citation> = citations_with_indices
476                        .iter()
477                        .map(|(citation, _)| *citation)
478                        .collect();
479                    // Create a local mapping for this year group
480                    let local_to_global: HashMap<*const Citation, usize> = citations_with_indices
481                        .iter()
482                        .map(|(citation, global_idx)| (*citation as *const Citation, *global_idx))
483                        .collect();
484                    duplicate_groups.extend(self.process_citation_group_with_sources(
485                        &citations_in_year,
486                        &source_map,
487                        &local_to_global,
488                    )?);
489                }
490                Ok(duplicate_groups)
491            }
492        } else {
493            let citations_refs: Vec<&Citation> = citations.iter().collect();
494            self.process_citation_group_with_sources(
495                &citations_refs,
496                &source_map,
497                &global_ptr_to_index,
498            )
499        }
500    }
501
    /// Get the year from a citation.
    ///
    /// Thin wrapper over [`Self::get_citation_year_static`]; returns `None`
    /// when the citation has no date.
    fn get_citation_year(citation: &Citation) -> Option<i32> {
        Self::get_citation_year_static(citation)
    }
506
507    fn select_unique_citation<'a>(&self, citations: &[&'a Citation]) -> &'a Citation {
508        if citations.len() == 1 {
509            return citations[0];
510        }
511
512        // If no source preference matches, prefer citations with abstracts
513        let citations_with_abstract: Vec<_> = citations
514            .iter()
515            .filter(|c| c.abstract_text.is_some())
516            .collect();
517
518        match citations_with_abstract.len() {
519            0 => citations[0],               // If no abstracts, use first citation
520            1 => citations_with_abstract[0], // If one abstract, use that
521            _ => {
522                // Multiple abstracts, prefer ones with DOI
523                let with_doi = citations_with_abstract
524                    .iter()
525                    .find(|c| c.doi.as_ref().is_some_and(|d| !d.is_empty()));
526
527                with_doi.copied().unwrap_or(citations_with_abstract[0])
528            }
529        }
530    }
531
532    fn select_unique_citation_with_sources<'a>(
533        &self,
534        citations: &[&'a Citation],
535        citation_indices: &[usize],
536        source_map: &HashMap<usize, Option<&str>>,
537    ) -> &'a Citation {
538        if citations.len() == 1 {
539            return citations[0];
540        }
541
542        // First try source preferences
543        if !self.config.source_preferences.is_empty() {
544            for preferred_source in &self.config.source_preferences {
545                for (citation, &idx) in citations.iter().zip(citation_indices.iter()) {
546                    if source_map.get(&idx) == Some(&Some(preferred_source.as_str())) {
547                        return citation;
548                    }
549                }
550            }
551        }
552
553        // If no source preference matches, use the standard selection logic
554        self.select_unique_citation(citations)
555    }
556
    /// Runs pairwise duplicate detection over one group of citations (either
    /// a single year bucket or the full input set).
    ///
    /// Each citation is normalized once up front; every input citation then
    /// ends up in exactly one returned `DuplicateGroup` (singletons get an
    /// empty `duplicates` list). `global_ptr_to_index` maps each citation
    /// reference back to its index in the caller's original slice so that
    /// sources can be looked up in `source_map`.
    ///
    /// Returns a `ProcessingError` when a title fails to normalize
    /// (`normalize_string` returns `None` only for an empty string).
    fn process_citation_group_with_sources(
        &self,
        citations: &[&Citation],
        source_map: &HashMap<usize, Option<&str>>,
        global_ptr_to_index: &HashMap<*const Citation, usize>,
    ) -> Result<Vec<DuplicateGroup>, DedupeError> {
        let mut duplicate_groups = Vec::new();

        // Preprocess all citations in this group once, so the O(n²) pairwise
        // loop below compares precomputed fields instead of re-normalizing.
        let preprocessed: Vec<PreprocessedCitation> = citations
            .iter()
            .map(|c| {
                Ok(PreprocessedCitation {
                    original: c,
                    // Fails only for an empty title.
                    normalized_title: Self::normalize_string(&Self::convert_unicode_string(
                        &c.title,
                    ))
                    .ok_or_else(|| {
                        DedupeError::ProcessingError("Failed to normalize title".to_string())
                    })?,
                    normalized_journal: Self::format_journal_name(c.journal.as_deref()),
                    normalized_journal_abbr: Self::format_journal_name(c.journal_abbr.as_deref()),
                    normalized_volume: c
                        .volume
                        .as_deref()
                        .map_or(String::new(), Deduplicator::normalize_volume),
                    normalized_issn: c
                        .issn
                        .iter()
                        .filter_map(|issn| Deduplicator::format_issn(issn))
                        .collect(),
                })
            })
            .collect::<Result<Vec<_>, _>>()?;

        // Indices already assigned to some group (as a duplicate or leader).
        let mut processed_indices = std::collections::HashSet::new();

        for i in 0..preprocessed.len() {
            if processed_indices.contains(&i) {
                continue;
            }

            let mut group_citations = vec![preprocessed[i].original];
            let mut group_indices = vec![i];
            let current = &preprocessed[i];

            // Compare against every other not-yet-grouped citation.
            for (j, other) in preprocessed.iter().enumerate() {
                if i == j || processed_indices.contains(&j) {
                    continue;
                }

                let journal_match = Self::journals_match(
                    &current.normalized_journal,
                    &current.normalized_journal_abbr,
                    &other.normalized_journal,
                    &other.normalized_journal_abbr,
                );
                let issns_match =
                    Self::match_issns(&current.normalized_issn, &other.normalized_issn);
                // Volumes/pages only count as matching when both sides are present.
                let volumes_match = !current.normalized_volume.is_empty()
                    && !other.normalized_volume.is_empty()
                    && current.normalized_volume == other.normalized_volume;
                let pages_match = current.original.pages.is_some()
                    && other.original.pages.is_some()
                    && current.original.pages == other.original.pages;
                let years_match = Self::get_citation_year(current.original)
                    == Self::get_citation_year(other.original);

                let is_duplicate = match (&current.original.doi, &other.original.doi) {
                    // With DOIs
                    (Some(doi1), Some(doi2)) if !doi1.is_empty() && !doi2.is_empty() => {
                        let title_similarity =
                            jaro(&current.normalized_title, &other.normalized_title);

                        // With Journal/ISSN match
                        (doi1 == doi2 && title_similarity >= DOI_TITLE_SIMILARITY_THRESHOLD && (journal_match || issns_match))
                        // Without Journal/ISSN match: only when we have same DOI (and we use volume/pages instead)
                        || (doi1 == doi2 && title_similarity >= 0.99 && (volumes_match || pages_match))
                        // Without DOI match: only when we have a very high title similarity and all other fields match
                        || (title_similarity >= 0.99 && years_match && (volumes_match || pages_match) && (journal_match || issns_match))
                    }
                    // Without DOIs
                    _ => {
                        let title_similarity =
                            jaro_winkler(&current.normalized_title, &other.normalized_title);

                        // With Journal/ISSN match
                        (title_similarity >= NO_DOI_TITLE_SIMILARITY_THRESHOLD && (volumes_match || pages_match) && (journal_match || issns_match))
                        // Without Journal/ISSN match: only when we have a very high title similarity and all other fields match
                        || (title_similarity >= 0.99 && years_match && (volumes_match && pages_match))
                    }
                };

                if is_duplicate {
                    group_citations.push(other.original);
                    group_indices.push(j);
                    processed_indices.insert(j);
                }
            }

            if group_citations.len() > 1 {
                // Convert local group indices to the caller's original indices
                // so sources can be looked up for preference-based selection.
                let original_indices: Vec<usize> = group_indices
                    .iter()
                    .map(|&local_idx| {
                        let citation_ptr = preprocessed[local_idx].original as *const Citation;
                        global_ptr_to_index[&citation_ptr]
                    })
                    .collect();

                let unique = self.select_unique_citation_with_sources(
                    &group_citations,
                    &original_indices,
                    source_map,
                );

                // Everything in the group except the chosen unique citation
                // becomes a duplicate.
                let duplicates: Vec<Citation> = group_citations
                    .into_iter()
                    .filter(|c| !std::ptr::eq(*c, unique))
                    .map(|c| (*c).clone())
                    .collect();

                duplicate_groups.push(DuplicateGroup {
                    unique: unique.clone(),
                    duplicates,
                });
                // Mark the group leader so a later pass cannot pull it into
                // another group. (Singletons are not marked: the matching
                // criteria above are symmetric, so a citation that matched
                // nothing here cannot be matched by a later citation either.)
                processed_indices.insert(i);
            } else {
                duplicate_groups.push(DuplicateGroup {
                    unique: current.original.clone(),
                    duplicates: Vec::new(),
                });
            }
        }

        Ok(duplicate_groups)
    }
694
695    fn group_by_year_with_indices(citations: &[Citation]) -> HashMap<i32, Vec<(&Citation, usize)>> {
696        let mut year_map: HashMap<i32, Vec<(&Citation, usize)>> = HashMap::new();
697
698        // TODO: handle citations without a year when grouping by year
699        for (index, citation) in citations.iter().enumerate() {
700            let year = Self::get_citation_year_static(citation).unwrap_or(0);
701            year_map.entry(year).or_default().push((citation, index));
702        }
703
704        year_map
705    }
    /// Static version of get_citation_year for use in static contexts
    ///
    /// Returns `None` when the citation carries no date.
    fn get_citation_year_static(citation: &Citation) -> Option<i32> {
        citation.date.as_ref().map(|d| d.year)
    }
710
    /// Decodes `<U+XXXX>` escape sequences into their actual Unicode
    /// characters; sequences that do not form a valid code point are kept
    /// verbatim.
    fn convert_unicode_string(input: &str) -> String {
        UNICODE_REGEX
            .replace_all(input, |caps: &crate::regex::Captures| {
                // caps[1] is the hex code point; fall back to the raw match
                // when it is not parseable or not a valid char.
                u32::from_str_radix(&caps[1], 16)
                    .ok()
                    .and_then(char::from_u32)
                    .map(|c| c.to_string())
                    .unwrap_or_else(|| caps[0].to_string())
            })
            .to_string()
    }
722
723    fn normalize_string(string: &str) -> Option<String> {
724        if string.is_empty() {
725            return None;
726        }
727
728        let mut result = String::with_capacity(string.len());
729        let mut s = string.trim().to_lowercase();
730
731        for replacement in HTML_REPLACEMENTS.iter() {
732            s = s.replace(replacement.0, replacement.1);
733        }
734
735        s.chars()
736            .filter(|c| c.is_alphanumeric())
737            .for_each(|c| result.push(c));
738
739        Some(result)
740    }
741
742    fn normalize_volume(volume: &str) -> String {
743        if volume.is_empty() {
744            return String::new();
745        }
746
747        // Find first sequence of numbers anywhere in the string
748        let numbers: String = volume
749            .chars()
750            .skip_while(|c| !c.is_numeric())
751            .take_while(|c| c.is_numeric())
752            .collect();
753
754        if numbers.is_empty() {
755            String::new()
756        } else {
757            numbers
758        }
759    }
760
761    /// Check if two journals match by comparing both full name and abbreviation
762    fn journals_match(
763        journal1: &Option<String>,
764        journal_abbr1: &Option<String>,
765        journal2: &Option<String>,
766        journal_abbr2: &Option<String>,
767    ) -> bool {
768        journal1
769            .as_ref()
770            .zip(journal2.as_ref())
771            .is_some_and(|(j1, j2)| j1 == j2)
772            || journal_abbr1
773                .as_ref()
774                .zip(journal_abbr2.as_ref())
775                .is_some_and(|(a1, a2)| a1 == a2)
776            || journal1
777                .as_ref()
778                .zip(journal_abbr2.as_ref())
779                .is_some_and(|(j1, a2)| j1 == a2)
780            || journal_abbr1
781                .as_ref()
782                .zip(journal2.as_ref())
783                .is_some_and(|(a1, j2)| a1 == j2)
784    }
785
786    fn format_journal_name(full_name: Option<&str>) -> Option<String> {
787        full_name.map(|name| {
788            name.split(". Conference")
789                .next()
790                .unwrap_or(name)
791                .trim()
792                .to_lowercase()
793                .chars()
794                .filter(|c| c.is_alphanumeric())
795                .collect::<String>()
796        })
797    }
798
799    fn format_issn(issn_str: &str) -> Option<String> {
800        // Remove common suffixes and extra text
801        let clean_issn = issn_str
802            .trim()
803            .replace("(Electronic)", "")
804            .replace("(Linking)", "")
805            .replace("(Print)", "")
806            .replace(|c: char| !c.is_ascii_digit() && c != '-' && c != 'X', "")
807            .trim()
808            .to_string();
809
810        // Extract all digits and X
811        let digits: String = clean_issn
812            .chars()
813            .filter(|c| c.is_ascii_digit() || *c == 'X')
814            .collect();
815
816        // Validate format
817        match (clean_issn.len(), digits.len()) {
818            // Valid formats: "1234-5678" (9 chars with hyphen) or "12345678" (8 chars without hyphen)
819            (9, 8) if clean_issn.chars().nth(4) == Some('-') => Some(clean_issn),
820            (8, 8) => Some(format!("{}-{}", &digits[..4], &digits[4..])),
821            _ => None,
822        }
823    }
824
825    fn match_issns(list1: &[String], list2: &[String]) -> bool {
826        list1
827            .iter()
828            .any(|isbn1| list2.iter().any(|isbn2| isbn1 == isbn2))
829    }
830}
831
832#[cfg(test)]
833mod tests {
834    use super::*;
835
836    #[test]
837    fn test_group_by_year() {
838        let citations = vec![
839            Citation {
840                title: "Title 1".to_string(),
841                authors: vec![],
842                journal: None,
843                journal_abbr: None,
844                date: Some(crate::Date {
845                    year: 2020,
846                    month: None,
847                    day: None,
848                }),
849                volume: None,
850                abstract_text: None,
851                doi: None,
852                ..Default::default()
853            },
854            Citation {
855                title: "Title 2".to_string(),
856                authors: vec![],
857                journal: None,
858                journal_abbr: None,
859                date: None,
860                volume: None,
861                abstract_text: None,
862                doi: None,
863                ..Default::default()
864            },
865        ];
866
867        let grouped = Deduplicator::group_by_year_with_indices(&citations);
868        assert_eq!(grouped.get(&2020).unwrap().len(), 1);
869        assert_eq!(grouped.get(&0).unwrap().len(), 1);
870    }
871
872    #[test]
873    fn test_find_duplicates() {
874        let citations = vec![
875            Citation {
876                title: "Title 1".to_string(),
877                date: Some(crate::Date {
878                    year: 2020,
879                    month: None,
880                    day: None,
881                }),
882                doi: Some("10.1234/abc".to_string()),
883                journal: Some("Journal 1".to_string()),
884                ..Default::default()
885            },
886            Citation {
887                title: "Title 1".to_string(),
888                date: Some(crate::Date {
889                    year: 2020,
890                    month: None,
891                    day: None,
892                }),
893                doi: Some("10.1234/abc".to_string()),
894                journal: Some("Journal 1".to_string()),
895                ..Default::default()
896            },
897            Citation {
898                title: "Title 2".to_string(),
899                date: Some(crate::Date {
900                    year: 2020,
901                    month: None,
902                    day: None,
903                }),
904                doi: Some("10.1234/def".to_string()),
905                journal: Some("Journal 2".to_string()),
906                ..Default::default()
907            },
908        ];
909
910        let deduplicator = Deduplicator::new();
911        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
912
913        assert_eq!(duplicate_groups.len(), 2);
914        assert_eq!(
915            duplicate_groups
916                .iter()
917                .find(|g| g.unique.doi == Some("10.1234/abc".to_string()))
918                .unwrap()
919                .duplicates
920                .len(),
921            1
922        );
923    }
924
925    #[test]
926    fn test_missing_doi() {
927        let citations = vec![
928            Citation {
929                title: "Title 1".to_string(),
930                date: Some(crate::Date {
931                    year: 2020,
932                    month: None,
933                    day: None,
934                }),
935                doi: Some("10.1234/abc".to_string()),
936                journal: Some("Journal 1".to_string()),
937                volume: Some("24".to_string()),
938                ..Default::default()
939            },
940            Citation {
941                title: "Title 1".to_string(),
942                date: Some(crate::Date {
943                    year: 2020,
944                    month: None,
945                    day: None,
946                }),
947                doi: Some("".to_string()),
948                journal: Some("Journal 1".to_string()),
949                volume: Some("24".to_string()),
950                ..Default::default()
951            },
952            Citation {
953                title: "Title 2".to_string(),
954                date: Some(crate::Date {
955                    year: 2020,
956                    month: None,
957                    day: None,
958                }),
959                doi: Some("".to_string()),
960                journal: Some("Journal 2".to_string()),
961                ..Default::default()
962            },
963        ];
964
965        let deduplicator = Deduplicator::new();
966        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
967
968        assert_eq!(duplicate_groups.len(), 2);
969    }
970
971    #[test]
972    fn test_normalize_string() {
973        assert_eq!(
974            Deduplicator::normalize_string("Machine Learning! (2<sup>nd</sup> Edition)"),
975            Some("machinelearning2ndedition".to_string())
976        );
977        assert_eq!(
978            Deduplicator::normalize_string("[&lt;sup&gt;11&lt;/sup&gt;C] benzo"),
979            Some("11cbenzo".to_string())
980        );
981    }
982
983    #[test]
984    fn test_convert_unicode_string() {
985        // Test basic conversion
986        assert_eq!(
987            Deduplicator::convert_unicode_string("2<U+0391>-amino-4<U+0391>"),
988            "2Α-amino-4Α",
989            "Failed to convert basic Alpha Unicode sequences"
990        );
991
992        // Test multiple different Unicode sequences
993        assert_eq!(
994            Deduplicator::convert_unicode_string("Hello <U+03A9>orld <U+03A3>cience"),
995            "Hello Ωorld Σcience",
996            "Failed to convert multiple Unicode sequences"
997        );
998
999        // Test string with no Unicode sequences
1000        assert_eq!(
1001            Deduplicator::convert_unicode_string("Normal String"),
1002            "Normal String",
1003            "Incorrectly modified string with no Unicode sequences"
1004        );
1005
1006        // Test empty string
1007        assert_eq!(
1008            Deduplicator::convert_unicode_string(""),
1009            "",
1010            "Failed to handle empty string"
1011        );
1012
1013        // Test mixed content
1014        assert_eq!(
1015            Deduplicator::convert_unicode_string("Mixed <U+0394> Unicode <U+03A9> Test"),
1016            "Mixed Δ Unicode Ω Test",
1017            "Failed to handle mixed content with Unicode sequences"
1018        );
1019
1020        // Test consecutive Unicode sequences
1021        assert_eq!(
1022            Deduplicator::convert_unicode_string("<U+0391><U+0392><U+0393>"),
1023            "ΑΒΓ",
1024            "Failed to convert consecutive Unicode sequences"
1025        );
1026    }
1027
1028    #[test]
1029    fn test_normalize_volume() {
1030        assert_eq!(Deduplicator::normalize_volume("61"), "61");
1031        assert_eq!(Deduplicator::normalize_volume("61 (Supplement 1)"), "61");
1032        assert_eq!(Deduplicator::normalize_volume("9 (8) (no pagination)"), "9");
1033        assert_eq!(Deduplicator::normalize_volume("3)"), "3");
1034        assert_eq!(Deduplicator::normalize_volume("Part A. 242"), "242");
1035        assert_eq!(Deduplicator::normalize_volume("55 (10 SUPPL 1)"), "55");
1036        assert_eq!(Deduplicator::normalize_volume("161A"), "161");
1037        assert_eq!(Deduplicator::normalize_volume("74 Suppl 1"), "74");
1038        assert_eq!(Deduplicator::normalize_volume("20 (2)"), "20");
1039        assert_eq!(
1040            Deduplicator::normalize_volume("9 (FEB) (no pagination)"),
1041            "9"
1042        );
1043    }
1044
1045    #[test]
1046    fn test_format_journal_name() {
1047        assert_eq!(
1048            Deduplicator::format_journal_name(Some(
1049                "Heart. Conference: British Atherosclerosis Society BAS/British Society for Cardiovascular Research BSCR Annual Meeting"
1050            )),
1051            Some("heart".to_string())
1052        );
1053        assert_eq!(
1054            Deduplicator::format_journal_name(Some(
1055                "The FASEB Journal. Conference: Experimental Biology"
1056            )),
1057            Some("thefasebjournal".to_string())
1058        );
1059        assert_eq!(
1060            Deduplicator::format_journal_name(Some(
1061                "Arteriosclerosis Thrombosis and Vascular Biology. Conference: American Heart Association's Arteriosclerosis Thrombosis and Vascular Biology"
1062            )),
1063            Some("arteriosclerosisthrombosisandvascularbiology".to_string())
1064        );
1065        assert_eq!(Deduplicator::format_journal_name(None), None);
1066        assert_eq!(
1067            Deduplicator::format_journal_name(Some("")),
1068            Some("".to_string())
1069        );
1070        assert_eq!(
1071            Deduplicator::format_journal_name(Some("Diabetologie und Stoffwechsel. Conference")),
1072            Some("diabetologieundstoffwechsel".to_string())
1073        );
1074    }
1075
1076    #[test]
1077    fn test_match_issns_scenarios() {
1078        // Scenario 1: Matching lists
1079        let issns1 = vec!["1234-5678".to_string(), "8765-4321".to_string()];
1080        let issns2 = vec!["0000-0000".to_string(), "1234-5678".to_string()];
1081        assert!(
1082            Deduplicator::match_issns(&issns1, &issns2),
1083            "Should find a matching ISSN"
1084        );
1085
1086        let non_match_issns2 = vec!["5555-6666".to_string(), "7777-8888".to_string()];
1087        assert!(
1088            !Deduplicator::match_issns(&issns1, &non_match_issns2),
1089            "Should not find a matching ISSN"
1090        );
1091
1092        // Scenario 3: Empty lists
1093        let empty_issns1: Vec<String> = vec![];
1094        let empty_issns2: Vec<String> = vec![];
1095        assert!(
1096            !Deduplicator::match_issns(&empty_issns1, &empty_issns2),
1097            "Should return false for empty lists"
1098        );
1099
1100        // Scenario 4: One empty list
1101        let partial_issns1 = vec!["1234-5678".to_string()];
1102        let partial_issns2: Vec<String> = vec![];
1103        assert!(
1104            !Deduplicator::match_issns(&partial_issns1, &partial_issns2),
1105            "Should return false when one list is empty"
1106        );
1107    }
1108
1109    #[test]
1110    fn test_format_issn() {
1111        assert_eq!(
1112            Deduplicator::format_issn("1234-5678"),
1113            Some("1234-5678".to_string())
1114        );
1115        assert_eq!(
1116            Deduplicator::format_issn("12345678"),
1117            Some("1234-5678".to_string())
1118        );
1119        assert_eq!(
1120            Deduplicator::format_issn("1234-567X"),
1121            Some("1234-567X".to_string())
1122        );
1123        assert_eq!(
1124            Deduplicator::format_issn("1234-567X (Electronic)"),
1125            Some("1234-567X".to_string())
1126        );
1127        assert_eq!(
1128            Deduplicator::format_issn("1234-5678 (Print)"),
1129            Some("1234-5678".to_string())
1130        );
1131        assert_eq!(
1132            Deduplicator::format_issn("1234-5678 (Linking)"),
1133            Some("1234-5678".to_string())
1134        );
1135        assert_eq!(Deduplicator::format_issn("invalid"), None);
1136        assert_eq!(Deduplicator::format_issn("1234-56789"), None);
1137        assert_eq!(Deduplicator::format_issn("123-45678"), None);
1138    }
1139
1140    #[test]
1141    fn test_without_year_grouping() {
1142        let citations = vec![
1143            Citation {
1144                title: "Title 1".to_string(),
1145                date: Some(crate::Date {
1146                    year: 2020,
1147                    month: None,
1148                    day: None,
1149                }),
1150                doi: Some("10.1234/abc".to_string()),
1151                journal: Some("Journal 1".to_string()),
1152                ..Default::default()
1153            },
1154            Citation {
1155                title: "Title 1".to_string(),
1156                date: Some(crate::Date {
1157                    year: 2019, // Different year
1158                    month: None,
1159                    day: None,
1160                }),
1161                doi: Some("10.1234/abc".to_string()),
1162                journal: Some("Journal 1".to_string()),
1163                ..Default::default()
1164            },
1165        ];
1166
1167        let config = DeduplicatorConfig {
1168            group_by_year: false,
1169            ..Default::default()
1170        };
1171        let deduplicator = Deduplicator::new().with_config(config);
1172        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
1173
1174        assert_eq!(duplicate_groups.len(), 1);
1175        assert_eq!(duplicate_groups[0].duplicates.len(), 1);
1176
1177        // Test with default year grouping (should not find duplicates across years)
1178        let deduplicator = Deduplicator::new();
1179        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
1180
1181        assert_eq!(duplicate_groups.len(), 2);
1182        assert!(duplicate_groups.iter().all(|g| g.duplicates.is_empty()));
1183    }
1184
1185    #[test]
1186    fn test_source_preferences() {
1187        let citations = vec![
1188            Citation {
1189                title: "Title 1".to_string(),
1190                doi: Some("10.1234/abc".to_string()),
1191                journal: Some("Journal 1".to_string()),
1192                date: Some(crate::Date {
1193                    year: 2020,
1194                    month: None,
1195                    day: None,
1196                }),
1197                ..Default::default()
1198            },
1199            Citation {
1200                title: "Title 1".to_string(),
1201                doi: Some("10.1234/abc".to_string()),
1202                journal: Some("Journal 1".to_string()),
1203                date: Some(crate::Date {
1204                    year: 2020,
1205                    month: None,
1206                    day: None,
1207                }),
1208                ..Default::default()
1209            },
1210        ];
1211
1212        let sources = vec!["source2", "source1"];
1213
1214        let config = DeduplicatorConfig {
1215            source_preferences: vec!["source1".to_string(), "source2".to_string()],
1216            ..Default::default()
1217        };
1218
1219        let deduplicator = Deduplicator::new().with_config(config);
1220        let duplicate_groups = deduplicator
1221            .find_duplicates_with_sources(&citations, &sources)
1222            .unwrap();
1223
1224        assert_eq!(duplicate_groups.len(), 1);
1225        // The second citation should be selected as unique because source1 (PubMed)
1226        // has higher priority than source2 (Embase) in our preferences
1227        assert_eq!(duplicate_groups[0].duplicates.len(), 1);
1228    }
1229
1230    #[test]
1231    fn test_abstract_preference() {
1232        let citations = vec![
1233            Citation {
1234                title: "Title 1".to_string(),
1235                abstract_text: None,
1236                doi: Some("10.1234/abc".to_string()),
1237                journal: Some("Journal 1".to_string()),
1238                date: Some(crate::Date {
1239                    year: 2020,
1240                    month: None,
1241                    day: None,
1242                }),
1243                ..Default::default()
1244            },
1245            Citation {
1246                title: "Title 1".to_string(),
1247                abstract_text: Some("Abstract".to_string()),
1248                doi: Some("10.1234/abc".to_string()),
1249                journal: Some("Journal 1".to_string()),
1250                date: Some(crate::Date {
1251                    year: 2020,
1252                    month: None,
1253                    day: None,
1254                }),
1255                ..Default::default()
1256            },
1257        ];
1258
1259        let deduplicator = Deduplicator::new();
1260        let duplicate_groups = deduplicator.find_duplicates(&citations).unwrap();
1261
1262        assert_eq!(duplicate_groups.len(), 1);
1263        // The citation with abstract should be selected as unique
1264        assert!(duplicate_groups[0].unique.abstract_text.is_some());
1265        assert_eq!(duplicate_groups[0].duplicates.len(), 1);
1266    }
1267
1268    #[test]
1269    fn test_source_preferences_with_year_grouping() {
1270        // Create citations from different years to test year grouping with source preferences
1271        let citations = vec![
1272            Citation {
1273                title: "Test Article 2020".to_string(),
1274                doi: Some("10.1234/test2020".to_string()),
1275                journal: Some("Test Journal".to_string()),
1276                date: Some(crate::Date {
1277                    year: 2020,
1278                    month: None,
1279                    day: None,
1280                }),
1281                ..Default::default()
1282            },
1283            Citation {
1284                title: "Test Article 2020".to_string(), // Same as above but different source
1285                doi: Some("10.1234/test2020".to_string()),
1286                journal: Some("Test Journal".to_string()),
1287                date: Some(crate::Date {
1288                    year: 2020,
1289                    month: None,
1290                    day: None,
1291                }),
1292                ..Default::default()
1293            },
1294            Citation {
1295                title: "Test Article 2021".to_string(),
1296                doi: Some("10.1234/test2021".to_string()),
1297                journal: Some("Test Journal".to_string()),
1298                date: Some(crate::Date {
1299                    year: 2021,
1300                    month: None,
1301                    day: None,
1302                }),
1303                ..Default::default()
1304            },
1305            Citation {
1306                title: "Test Article 2021".to_string(), // Same as above but different source
1307                doi: Some("10.1234/test2021".to_string()),
1308                journal: Some("Test Journal".to_string()),
1309                date: Some(crate::Date {
1310                    year: 2021,
1311                    month: None,
1312                    day: None,
1313                }),
1314                ..Default::default()
1315            },
1316        ];
1317
1318        // Sources with PubMed having higher priority
1319        let sources = vec!["Embase", "PubMed", "Embase", "PubMed"];
1320
1321        let config = DeduplicatorConfig {
1322            group_by_year: true, // This is the key - enable year grouping
1323            run_in_parallel: false,
1324            source_preferences: vec!["PubMed".to_string(), "Embase".to_string()],
1325        };
1326
1327        let deduplicator = Deduplicator::new().with_config(config);
1328        let duplicate_groups = deduplicator
1329            .find_duplicates_with_sources(&citations, &sources)
1330            .unwrap();
1331
1332        // Should find 2 duplicate groups (one for each year)
1333        assert_eq!(duplicate_groups.len(), 2);
1334
1335        // Both unique citations should be from PubMed (indices 1 and 3)
1336        // We can't directly check the source, but we can check that each group has the expected structure
1337        let unique_titles: Vec<&str> = duplicate_groups
1338            .iter()
1339            .map(|group| group.unique.title.as_str())
1340            .collect();
1341
1342        assert!(unique_titles.contains(&"Test Article 2020"));
1343        assert!(unique_titles.contains(&"Test Article 2021"));
1344
1345        // Each group should have exactly one duplicate
1346        for group in &duplicate_groups {
1347            assert_eq!(group.duplicates.len(), 1);
1348        }
1349    }
1350}