use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, Write};
use url::Url;
use rayon::prelude::*;
/// Builds the path component of a de-duplication key.
///
/// Every path segment except the last that consists solely of ASCII digits is
/// replaced by the placeholder `"X"`, so URLs that differ only in a numeric id
/// in the middle of the path normalize to the same string. The final segment
/// is always kept verbatim.
///
/// Segments are joined with `/` and the result has no leading slash. An empty
/// string is returned when the URL has no path segments (`path_segments()`
/// returns `None`).
fn normalize_path(url: &Url) -> String {
    let segments: Vec<&str> = match url.path_segments() {
        Some(segments) => segments.collect(),
        None => return String::new(),
    };
    // `saturating_sub` keeps this safe even if the segment list were empty.
    let last_index = segments.len().saturating_sub(1);

    segments
        .iter()
        .enumerate()
        .map(|(index, &segment)| {
            // `all` is vacuously true for an empty string, so an empty segment
            // (produced by `//` in the path) must be excluded explicitly or it
            // would be mistaken for a numeric id.
            let is_numeric_id =
                !segment.is_empty() && segment.bytes().all(|byte| byte.is_ascii_digit());
            if index != last_index && is_numeric_id {
                "X"
            } else {
                segment
            }
        })
        .collect::<Vec<_>>()
        .join("/")
}
/// Builds the query component of a de-duplication key.
///
/// Only the parameter names are used; their values are discarded. The names
/// are sorted and joined with `&`, so parameter order does not affect the
/// result. Repeated names are kept as many times as they occur.
fn normalize_query(url: &Url) -> String {
    let mut names = Vec::new();
    for (name, _value) in url.query_pairs() {
        names.push(name.into_owned());
    }
    names.sort_unstable();
    names.join("&")
}
/// Computes the key under which a URL is de-duplicated.
///
/// The key has the shape `scheme://host[:port]/path?query`, where `path` and
/// `query` come from `normalize_path` and `normalize_query`. Two URLs with the
/// same key are treated as duplicates.
fn dedup_key(url: &Url) -> String {
    // Include the effective port so the same host on different ports is not
    // merged; the scheme's default port is used when none is given explicitly.
    let port = url
        .port_or_known_default()
        .map(|port| format!(":{}", port))
        .unwrap_or_default();

    // The explicit `/` separates authority from path. `normalize_path` returns
    // the path without a leading slash, and without the separator a host such
    // as `example.com` with path `a` would collide with host `example.coma`
    // and an empty path.
    format!(
        "{}://{}{}/{}?{}",
        url.scheme(),
        url.host_str().unwrap_or_default(),
        port,
        normalize_path(url),
        normalize_query(url)
    )
}
fn deduplicate_urls_from_stdin(
show_params_extensions: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let stdin = io::stdin();
let reader = stdin.lock();
let lines: Vec<String> = reader.lines().collect::<Result<_, _>>()?;
let excluded_extensions: HashSet<&str> = [
"css", "jpg", "jpeg", "png", "gif",
"webp", "woff2", "woff", "ttf", "ico",
]
.into();
let parsed_urls: Vec<Url> = lines
.par_iter()
.filter_map(|line| {
let trimmed = line.trim();
let url = Url::parse(trimmed).ok()?;
let last_segment = url
.path_segments()
.and_then(|mut s| s.next_back())
.unwrap_or("");
let ext = last_segment
.rsplit('.')
.next()
.unwrap_or("")
.to_lowercase();
if excluded_extensions.contains(ext.as_str()) {
return None;
}
if show_params_extensions
&& url.query().is_none()
&& !last_segment.contains('.')
{
return None;
}
Some(url)
})
.collect();
let seen: HashMap<String, Url> = parsed_urls
.into_par_iter()
.fold(
HashMap::new,
|mut acc, url| {
let key = dedup_key(&url);
acc.entry(key).or_insert(url);
acc
},
)
.reduce(
HashMap::new,
|mut acc, map| {
acc.extend(map);
acc
},
);
let mut stdout = io::stdout();
for url in seen.into_values() {
writeln!(stdout, "{}", url)?;
}
Ok(())
}
/// Entry point: runs the stdin de-duplication with the params/extensions
/// filter enabled.
///
/// On failure the error is printed to stderr and the process exits with
/// status 1 so that shell pipelines and scripts can detect the failure.
fn main() {
    let show_params_extensions = true;
    if let Err(e) = deduplicate_urls_from_stdin(show_params_extensions) {
        eprintln!("Error: {}", e);
        std::process::exit(1);
    }
}