use anyhow::Result;
use chrono::Utc;
use std::collections::{HashMap, HashSet};
use url::Url;
use crate::exporter::Bookmark;
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct DeduplicationConfig {
pub normalize_urls: bool,
pub ignore_query_params: bool,
pub ignore_fragment: bool,
pub ignore_www: bool,
pub ignore_protocol: bool,
pub case_sensitive: bool,
pub merge_strategy: MergeStrategy,
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum MergeStrategy {
KeepFirst,
KeepLast,
KeepMostRecent,
KeepMostFrequent,
MergeMetadata,
}
impl Default for DeduplicationConfig {
fn default() -> Self {
Self {
normalize_urls: true,
ignore_query_params: true,
ignore_fragment: true,
ignore_www: true,
ignore_protocol: true,
case_sensitive: false,
merge_strategy: MergeStrategy::MergeMetadata,
}
}
}
#[derive(Debug)]
pub struct DeduplicationResult {
pub unique_bookmarks: Vec<Bookmark>,
pub duplicates_removed: usize,
pub duplicates_found: usize,
pub merge_summary: HashMap<String, usize>,
}
pub struct BookmarkDeduplicator {
config: DeduplicationConfig,
}
impl BookmarkDeduplicator {
pub fn new(config: DeduplicationConfig) -> Self {
Self { config }
}
pub fn deduplicate(&self, bookmarks: &[Bookmark]) -> Result<DeduplicationResult> {
let mut url_groups: HashMap<String, Vec<Bookmark>> = HashMap::new();
let mut seen_urls: HashSet<String> = HashSet::new();
for bookmark in bookmarks {
if let Some(ref url) = bookmark.url {
let normalized_url = self.normalize_url(url)?;
if seen_urls.contains(&normalized_url) {
if let Some(group) = url_groups.get_mut(&normalized_url) {
group.push(bookmark.clone());
}
} else {
seen_urls.insert(normalized_url.clone());
url_groups.insert(normalized_url, vec![bookmark.clone()]);
}
}
}
let mut unique_bookmarks = Vec::new();
let mut duplicates_removed = 0;
let mut duplicates_found = 0;
let mut merge_summary = HashMap::new();
for (normalized_url, group) in url_groups {
if group.len() == 1 {
unique_bookmarks.push(group.into_iter().next().unwrap());
} else {
duplicates_found += group.len() - 1;
let merged = self.merge_bookmarks(&group)?;
duplicates_removed += group.len() - 1;
merge_summary.insert(normalized_url, group.len());
unique_bookmarks.push(merged);
}
}
Ok(DeduplicationResult {
unique_bookmarks,
duplicates_removed,
duplicates_found,
merge_summary,
})
}
fn normalize_url(&self, url_str: &str) -> Result<String> {
let mut url = Url::parse(url_str)?;
if self.config.ignore_protocol {
url.set_scheme("http").ok(); }
if self.config.ignore_www {
let host = url.host_str().unwrap_or("").to_string();
if host.starts_with("www.") {
let new_host = &host[4..];
url.set_host(Some(new_host))?;
}
}
if self.config.normalize_urls {
let path = url.path().trim_end_matches('/').to_string();
url.set_path(&path);
}
if self.config.ignore_query_params {
url.set_query(None);
}
if self.config.ignore_fragment {
url.set_fragment(None);
}
let mut normalized = url.to_string();
if !self.config.case_sensitive {
normalized = normalized.to_lowercase();
}
Ok(normalized)
}
fn merge_bookmarks(&self, bookmarks: &[Bookmark]) -> Result<Bookmark> {
match self.config.merge_strategy {
MergeStrategy::KeepFirst => Ok(bookmarks[0].clone()),
MergeStrategy::KeepLast => Ok(bookmarks[bookmarks.len() - 1].clone()),
MergeStrategy::KeepMostRecent => {
let most_recent = bookmarks
.iter()
.max_by_key(|b| b.date_added.unwrap_or_else(Utc::now))
.unwrap();
Ok(most_recent.clone())
}
MergeStrategy::KeepMostFrequent => {
let mut title_counts: HashMap<String, usize> = HashMap::new();
for bookmark in bookmarks {
*title_counts.entry(bookmark.title.clone()).or_insert(0) += 1;
}
let most_frequent_title = title_counts
.iter()
.max_by_key(|(_, count)| *count)
.map(|(title, _)| title.clone())
.unwrap_or_else(|| bookmarks[0].title.clone());
let bookmark = bookmarks
.iter()
.find(|b| b.title == most_frequent_title)
.unwrap_or(&bookmarks[0]);
Ok(bookmark.clone())
}
MergeStrategy::MergeMetadata => {
let first_bookmark = &bookmarks[0];
let title = bookmarks
.iter()
.filter(|b| !b.title.is_empty())
.max_by_key(|b| b.date_added.unwrap_or_else(Utc::now))
.map(|b| b.title.clone())
.unwrap_or_else(|| first_bookmark.title.clone());
let date_added = bookmarks.iter().filter_map(|b| b.date_added).max();
let mut folders = Vec::new();
for bookmark in bookmarks {
if let Some(ref folder) = bookmark.folder {
if !folders.contains(&folder.clone()) {
folders.push(folder.clone());
}
}
}
let folder = if folders.is_empty() {
None
} else if folders.len() == 1 {
Some(folders[0].clone())
} else {
Some(format!("Merged: {}", folders.join(", ")))
};
Ok(Bookmark {
id: first_bookmark.id.clone(),
title,
url: first_bookmark.url.clone(),
folder,
date_added,
children: None, })
}
}
}
}
pub fn find_potential_duplicates(bookmarks: &[Bookmark]) -> Result<Vec<(Bookmark, Bookmark, f64)>> {
let mut duplicates = Vec::new();
for (i, bookmark1) in bookmarks.iter().enumerate() {
for bookmark2 in bookmarks.iter().skip(i + 1) {
if let (Some(url1), Some(url2)) = (&bookmark1.url, &bookmark2.url) {
let similarity = calculate_url_similarity(url1, url2)?;
if similarity > 0.8 {
duplicates.push((bookmark1.clone(), bookmark2.clone(), similarity));
}
}
}
}
Ok(duplicates)
}
fn calculate_url_similarity(url1: &str, url2: &str) -> Result<f64> {
let parsed1 = Url::parse(url1)?;
let parsed2 = Url::parse(url2)?;
let mut score = 0.0;
let mut factors = 0;
if parsed1.host_str() == parsed2.host_str() {
score += 0.5;
}
factors += 1;
let path1 = normalize_path(parsed1.path());
let path2 = normalize_path(parsed2.path());
if path1 == path2 {
score += 0.3;
} else {
let path_parts1: HashSet<_> = path1.split('/').collect();
let path_parts2: HashSet<_> = path2.split('/').collect();
let path_similarity = jaccard_similarity(&path_parts1, &path_parts2);
score += path_similarity * 0.3;
}
factors += 1;
if let (Some(query1), Some(query2)) = (parsed1.query(), parsed2.query()) {
let params1: HashSet<&str> = query1.split('&').collect();
let params2: HashSet<&str> = query2.split('&').collect();
let query_similarity = jaccard_similarity(¶ms1, ¶ms2);
score += query_similarity * 0.2;
} else if parsed1.query().is_none() && parsed2.query().is_none() {
score += 0.2;
}
factors += 1;
Ok(score / factors as f64)
}
fn normalize_path(path: &str) -> String {
path.trim_end_matches('/').to_lowercase()
}
fn jaccard_similarity<T: std::hash::Hash + Eq>(set1: &HashSet<T>, set2: &HashSet<T>) -> f64 {
if set1.is_empty() && set2.is_empty() {
return 1.0;
}
let intersection: HashSet<_> = set1.intersection(set2).collect();
let union: HashSet<_> = set1.union(set2).collect();
intersection.len() as f64 / union.len() as f64
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_url_normalization() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
assert_eq!(
deduplicator
.normalize_url("https://www.example.com/path?param=value#section")
.unwrap(),
"http://example.com/path"
);
assert_eq!(
deduplicator
.normalize_url("https://example.com/path")
.unwrap(),
"http://example.com/path"
);
}
#[test]
fn test_deduplication() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
let bookmarks = vec![
Bookmark {
id: "1".to_string(),
title: "Example".to_string(),
url: Some("https://www.example.com".to_string()),
folder: Some("folder1".to_string()),
date_added: None,
children: None,
},
Bookmark {
id: "2".to_string(),
title: "Example Site".to_string(),
url: Some("http://example.com".to_string()),
folder: Some("folder2".to_string()),
date_added: None,
children: None,
},
];
let result = deduplicator.deduplicate(&bookmarks).unwrap();
assert_eq!(result.unique_bookmarks.len(), 1);
assert_eq!(result.duplicates_removed, 1);
}
#[test]
fn test_trailing_slash_normalization() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
assert_eq!(
deduplicator.normalize_url("https://example.com/path/").unwrap(),
"http://example.com/path"
);
assert_eq!(
deduplicator.normalize_url("https://example.com/path").unwrap(),
"http://example.com/path"
);
}
#[test]
fn test_query_params_removal() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
assert_eq!(
deduplicator
.normalize_url("https://example.com/path?utm_source=google&id=123")
.unwrap(),
"http://example.com/path"
);
assert_eq!(
deduplicator
.normalize_url("https://example.com/path#section")
.unwrap(),
"http://example.com/path"
);
}
#[test]
fn test_case_insensitive() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
let url1 = deduplicator.normalize_url("https://EXAMPLE.com/PATH").unwrap();
let url2 = deduplicator.normalize_url("https://example.com/path").unwrap();
assert_eq!(url1, url2);
}
#[test]
fn test_multiple_duplicates() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
let bookmarks = vec![
Bookmark {
id: "1".to_string(),
title: "Example 1".to_string(),
url: Some("https://www.example.com".to_string()),
folder: None,
date_added: None,
children: None,
},
Bookmark {
id: "2".to_string(),
title: "Example 2".to_string(),
url: Some("http://example.com".to_string()),
folder: None,
date_added: None,
children: None,
},
Bookmark {
id: "3".to_string(),
title: "Example 3".to_string(),
url: Some("https://example.com/".to_string()),
folder: None,
date_added: None,
children: None,
},
];
let result = deduplicator.deduplicate(&bookmarks).unwrap();
assert_eq!(result.unique_bookmarks.len(), 1);
assert_eq!(result.duplicates_removed, 2);
assert_eq!(result.duplicates_found, 2);
}
#[test]
fn test_no_duplicates() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
let bookmarks = vec![
Bookmark {
id: "1".to_string(),
title: "GitHub".to_string(),
url: Some("https://github.com".to_string()),
folder: None,
date_added: None,
children: None,
},
Bookmark {
id: "2".to_string(),
title: "Rust".to_string(),
url: Some("https://rust-lang.org".to_string()),
folder: None,
date_added: None,
children: None,
},
];
let result = deduplicator.deduplicate(&bookmarks).unwrap();
assert_eq!(result.unique_bookmarks.len(), 2);
assert_eq!(result.duplicates_removed, 0);
assert_eq!(result.duplicates_found, 0);
}
#[test]
fn test_bookmark_without_url() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
let bookmarks = vec![
Bookmark {
id: "1".to_string(),
title: "Example".to_string(),
url: Some("https://example.com".to_string()),
folder: None,
date_added: None,
children: None,
},
Bookmark {
id: "2".to_string(),
title: "No URL".to_string(),
url: None,
folder: None,
date_added: None,
children: None,
},
];
let result = deduplicator.deduplicate(&bookmarks).unwrap();
assert_eq!(result.unique_bookmarks.len(), 1);
}
#[test]
fn test_merge_strategies() {
use chrono::Utc;
let bookmarks = vec![
Bookmark {
id: "1".to_string(),
title: "First".to_string(),
url: Some("https://example.com".to_string()),
folder: Some("folder1".to_string()),
date_added: None,
children: None,
},
Bookmark {
id: "2".to_string(),
title: "Last".to_string(),
url: Some("http://example.com".to_string()),
folder: Some("folder2".to_string()),
date_added: Some(Utc::now()),
children: None,
},
];
let config = DeduplicationConfig {
merge_strategy: MergeStrategy::KeepFirst,
..Default::default()
};
let deduplicator = BookmarkDeduplicator::new(config);
let result = deduplicator.deduplicate(&bookmarks).unwrap();
assert_eq!(result.unique_bookmarks[0].title, "First");
assert_eq!(result.unique_bookmarks[0].id, "1");
let config = DeduplicationConfig {
merge_strategy: MergeStrategy::KeepLast,
..Default::default()
};
let deduplicator = BookmarkDeduplicator::new(config);
let result = deduplicator.deduplicate(&bookmarks).unwrap();
assert_eq!(result.unique_bookmarks[0].title, "Last");
assert_eq!(result.unique_bookmarks[0].id, "2");
}
#[test]
fn test_complex_url_normalization() {
let config = DeduplicationConfig::default();
let deduplicator = BookmarkDeduplicator::new(config);
let test_cases = vec![
("https://www.example.com/path", "http://example.com/path"),
("https://example.com/path/", "http://example.com/path"),
("https://example.com/path?foo=bar", "http://example.com/path"),
("https://example.com/path#section", "http://example.com/path"),
("https://EXAMPLE.COM/path", "http://example.com/path"),
("http://www.example.com/path/", "http://example.com/path"),
];
for (input, expected) in test_cases {
let result = deduplicator.normalize_url(input).unwrap();
assert_eq!(result, expected, "Failed for input: {}", input);
}
}
}