ureld 0.1.43

Ureld is a simple & fast URL de-cluttering tool written in Rust.
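
A quick sketch of the intended use, assuming the binary is installed under the crate name: pipe newline-separated URLs in on stdin and read the surviving ones on stdout.

    cat urls.txt | ureld

For instance, https://example.com/users/1/posts?page=2 and https://example.com/users/2/posts?page=9 collapse to a single output line, because intermediate numeric path segments and query values are ignored when building the deduplication key.
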
use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, Write};
use url::Url;
use rayon::prelude::*;

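/// Normalizes a URL path for deduplication: every path segment except the
/// last that consists entirely of ASCII digits is replaced with "X", so the
/// path "/users/42/posts/7" becomes "users/X/posts/7".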
fn normalize_path(url: &Url) -> String {
    let segments: Vec<&str> = match url.path_segments() {
        Some(s) => s.collect(),
        None => return String::new(),
    };

    if segments.is_empty() {
        return String::new();
    }

    let last_index = segments.len() - 1;

    segments
        .iter()
        .enumerate()
        .map(|(i, &seg)| {
            if i != last_index && seg.chars().all(|c| c.is_ascii_digit()) {
                "X"
            } else {
                seg
            }
        })
        .collect::<Vec<_>>()
        .join("/")
}

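/// Reduces the query string to its sorted parameter names, ignoring values:
/// "?b=2&a=1" becomes "a&b".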
fn normalize_query(url: &Url) -> String {
    let mut keys: Vec<String> = url
        .query_pairs()
        .map(|(k, _)| k.into_owned())
        .collect();

    keys.sort_unstable();
    keys.join("&")
}

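/// Builds the deduplication key from scheme, host, normalized path and sorted
/// query keys, e.g. "https://example.com/users/X/posts?a&b".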
fn dedup_key(url: &Url) -> String {
    format!(
        "{}://{}{}?{}",
        url.scheme(),
        url.host_str().unwrap_or_default(),
        normalize_path(url),
        normalize_query(url)
    )
}

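/// Reads URLs from stdin, drops static assets (CSS, images, fonts), optionally
/// keeps only URLs carrying query parameters or a file extension, and prints
/// one representative URL per deduplication key.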
fn deduplicate_urls_from_stdin(
    show_params_extensions: bool,
) -> Result<(), Box<dyn std::error::Error>> {
    let stdin = io::stdin();
    let reader = stdin.lock();
    let lines: Vec<String> = reader.lines().collect::<Result<_, _>>()?;

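    // File extensions treated as static assets and dropped outright.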
    let excluded_extensions: HashSet<&str> = [
        "css", "jpg", "jpeg", "png", "gif",
        "webp", "woff2", "woff", "ttf", "ico",
    ]
    .into();

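    // Parse lines in parallel, discarding anything that is not a valid
    // absolute URL, then filter out static assets.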
    let parsed_urls: Vec<Url> = lines
        .par_iter()
        .filter_map(|line| {
            let trimmed = line.trim();
            let url = Url::parse(trimmed).ok()?;

            let last_segment = url
                .path_segments()
                .and_then(|mut s| s.next_back())
                .unwrap_or("");

            // Extension of the last segment, if any; a segment without a dot
            // yields an empty string and is never excluded.
            let ext = last_segment
                .rsplit_once('.')
                .map(|(_, e)| e.to_lowercase())
                .unwrap_or_default();

            if excluded_extensions.contains(ext.as_str()) {
                return None;
            }

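            // When the flag is set, keep only URLs that carry query parameters
            // or a file extension in their last path segment.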
            if show_params_extensions
                && url.query().is_none()
                && !last_segment.contains('.')
            {
                return None;
            }

            Some(url)
        })
        .collect();

    // Group URLs by dedup key in parallel: fold builds one map per rayon
    // worker and reduce merges them, keeping a single URL per key.
    let seen: HashMap<String, Url> = parsed_urls
        .into_par_iter()
        .fold(
            HashMap::new,
            |mut acc, url| {
                let key = dedup_key(&url);
                acc.entry(key).or_insert(url);
                acc
            },
        )
        .reduce(
            HashMap::new,
            |mut acc, map| {
                for (key, url) in map {
                    acc.entry(key).or_insert(url);
                }
                acc
            },
        );

    // HashMap iteration order is unspecified, so the output order is arbitrary.
    let mut stdout = io::stdout().lock();
    for url in seen.into_values() {
        writeln!(stdout, "{}", url)?;
    }

    Ok(())
}

fn main() {
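    // The filtering flag is hard-coded; there is no command-line parsing here.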
    let show_params_extensions = true;

    if let Err(e) = deduplicate_urls_from_stdin(show_params_extensions) {
        eprintln!("Error: {}", e);
    }
}
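
// Illustrative test sketch: numeric intermediate path segments collapse to
// "X" and query values are dropped, so both URLs below share one key.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn similar_urls_share_a_key() {
        let a = Url::parse("https://example.com/users/42/posts?b=2&a=1").unwrap();
        let b = Url::parse("https://example.com/users/99/posts?a=5&b=6").unwrap();
        assert_eq!(dedup_key(&a), dedup_key(&b));
    }
}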