use std::collections::HashSet;
use url::Url;
/// Generates the common alternative spellings of `url` that usually point at the same page.
///
/// The result is the cross product of:
/// - scheme: `http` and `https`
/// - host: with and without a leading `www.`
/// - index file: none, `index.html`, `index.php`
/// - trailing slash: present and absent
///
/// Every variant starts as a clone of the parsed input and only has its scheme, host and
/// path replaced, so the remaining components (query, fragment, ...) are carried over.
///
/// An index file that is already present in the input path (`/page/index.php`, optionally
/// followed by slashes) is replaced rather than stacked, so the output never contains
/// paths such as `/page/index.php/index.html`.
///
/// # Returns
///
/// The distinct variants, sorted lexicographically so the output is deterministic.
/// If `url` cannot be parsed, or no variant can be built (for example the URL cannot be
/// converted to `http`/`https`), the result is a single-element vector holding `url` as-is.
pub fn generate_url_permutations(url: &str) -> Vec<String> {
    /// Index file names treated as interchangeable with their containing directory.
    const INDEX_FILES: [&str; 2] = ["index.html", "index.php"];

    /// Builds every path spelling (index file x trailing slash) for `original_path`.
    /// The list may contain duplicates; the caller de-duplicates full URLs.
    fn path_variants(original_path: &str) -> Vec<String> {
        // Drop an index file the input already carries. It must be a whole path segment:
        // `/myindex.html` is a regular page, not an index file.
        let unslashed = original_path.trim_end_matches('/');
        let dir_path = INDEX_FILES
            .iter()
            .find_map(|index| unslashed.strip_suffix(*index)?.strip_suffix('/'))
            .unwrap_or(original_path);

        // The directory itself, followed by the directory plus each index file.
        let stems = std::iter::once(dir_path.to_string()).chain(INDEX_FILES.iter().map(|index| {
            if dir_path.ends_with('/') {
                format!("{}{}", dir_path, index)
            } else {
                format!("{}/{}", dir_path, index)
            }
        }));

        let mut variants = Vec::with_capacity(2 * (INDEX_FILES.len() + 1));
        for stem in stems {
            let with_slash = if stem.is_empty() || stem.ends_with('/') {
                stem.clone()
            } else {
                format!("{}/", stem)
            };
            // The root path keeps its single slash; everything else loses one.
            let without_slash = if stem == "/" {
                stem
            } else {
                stem.strip_suffix('/').unwrap_or(&stem).to_string()
            };
            for path in [with_slash, without_slash] {
                variants.push(if path.is_empty() { "/".to_string() } else { path });
            }
        }
        variants
    }

    let Ok(parsed) = Url::parse(url) else {
        return vec![url.to_string()];
    };

    // Path variants do not depend on scheme or host, so compute them once.
    let paths = path_variants(parsed.path());

    let mut perms = HashSet::new();
    for scheme in ["http", "https"] {
        for www in [true, false] {
            let mut variant = parsed.clone();
            if variant.set_scheme(scheme).is_err() {
                continue;
            }
            if let Some(host) = variant.host_str() {
                let new_host = match (www, host.strip_prefix("www.")) {
                    (true, None) => format!("www.{}", host),
                    (false, Some(bare)) => bare.to_string(),
                    _ => host.to_string(),
                };
                // Some hosts cannot take or lose the prefix (the setter rejects the
                // result); skip this host variant instead of failing the whole call.
                if variant.set_host(Some(&new_host)).is_err() {
                    continue;
                }
            }
            for path in &paths {
                variant.set_path(path);
                perms.insert(variant.to_string());
            }
        }
    }

    // Parseable URLs that cannot be turned into http(s) would otherwise yield nothing;
    // mirror the unparseable case and hand the input back.
    if perms.is_empty() {
        return vec![url.to_string()];
    }

    let mut result: Vec<String> = perms.into_iter().collect();
    result.sort_unstable();
    result
}
/// Reduces `url` to a canonical form so that equivalent spellings compare equal.
///
/// Steps applied, in order:
/// 1. the scheme is forced to `https`;
/// 2. a leading `www.` is removed from the host;
/// 3. trailing slashes and trailing `/index.html` / `/index.php` segments are removed
///    from the path until none remain (an empty path becomes `/`);
/// 4. query pairs are re-serialized sorted by key (pairs with equal keys keep their
///    relative order); a query without any pairs (`?`, `?&`) is dropped entirely;
/// 5. the fragment is removed.
///
/// # Returns
///
/// The normalized URL. If `url` cannot be parsed, cannot be switched to `https`, or the
/// host cannot be rewritten, the input string is returned unchanged.
pub fn normalize_url(url: &str) -> String {
    let Ok(mut parsed) = Url::parse(url) else {
        return url.to_string();
    };
    if parsed.set_scheme("https").is_err() {
        return url.to_string();
    }

    let bare_host = parsed
        .host_str()
        .and_then(|host| host.strip_prefix("www."))
        .map(str::to_owned);
    if let Some(host) = bare_host {
        if parsed.set_host(Some(&host)).is_err() {
            return url.to_string();
        }
    }

    // Peel slashes and index files repeatedly: removing an index file can expose another
    // trailing slash (`/page//index.html`), and stopping early would make the function
    // non-idempotent.
    let mut path = parsed.path();
    loop {
        let trimmed = path.trim_end_matches('/');
        let trimmed = trimmed
            .strip_suffix("/index.html")
            .or_else(|| trimmed.strip_suffix("/index.php"))
            .unwrap_or(trimmed);
        if trimmed.len() == path.len() {
            break;
        }
        path = trimmed;
    }
    let path = if path.is_empty() {
        "/".to_string()
    } else {
        path.to_string()
    };
    parsed.set_path(&path);

    let mut pairs: Vec<(String, String)> = parsed
        .query_pairs()
        .map(|(key, value)| (key.into_owned(), value.into_owned()))
        .collect();
    if pairs.is_empty() {
        // Covers both "no query" and an empty/separator-only query such as `?` or `?&`.
        parsed.set_query(None);
    } else {
        // Stable sort on the key only: repeated keys (`a=1&a=2`) keep their order.
        pairs.sort_by(|a, b| a.0.cmp(&b.0));
        let mut serializer = parsed.query_pairs_mut();
        serializer.clear();
        for (key, value) in &pairs {
            serializer.append_pair(key, value);
        }
    }

    parsed.set_fragment(None);
    parsed.to_string()
}
/// Unit tests for `normalize_url` and `generate_url_permutations`.
#[cfg(test)]
mod tests {
    use super::*;

    // A leading `www.` label is dropped; other subdomain labels are kept.
    #[test]
    fn test_normalize_url_removes_www() {
        assert_eq!(
            normalize_url("https://www.example.com/page"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("https://www.subdomain.example.com/page"),
            "https://subdomain.example.com/page"
        );
    }

    // `http` inputs are upgraded to `https`.
    #[test]
    fn test_normalize_url_prefers_https() {
        assert_eq!(
            normalize_url("http://example.com/page"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("http://www.example.com/page"),
            "https://example.com/page"
        );
    }

    // Trailing slashes are removed, except for the root path which stays `/`.
    #[test]
    fn test_normalize_url_removes_trailing_slash() {
        assert_eq!(
            normalize_url("https://example.com/page/"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("https://example.com/"),
            "https://example.com/"
        );
        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    // `index.html` / `index.php` collapse into their directory.
    #[test]
    fn test_normalize_url_removes_index_files() {
        assert_eq!(
            normalize_url("https://example.com/page/index.html"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("https://example.com/page/index.php"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("https://example.com/index.html"),
            "https://example.com/"
        );
    }

    // Query pairs come out ordered by key.
    #[test]
    fn test_normalize_url_sorts_query_params() {
        assert_eq!(
            normalize_url("https://example.com/page?z=1&a=2"),
            "https://example.com/page?a=2&z=1"
        );
        assert_eq!(
            normalize_url("https://example.com/page?c=3&b=2&a=1"),
            "https://example.com/page?a=1&b=2&c=3"
        );
    }

    // The fragment is dropped, with or without a query in front of it.
    #[test]
    fn test_normalize_url_removes_fragment() {
        assert_eq!(
            normalize_url("https://example.com/page#section"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("https://example.com/page?key=value#section"),
            "https://example.com/page?key=value"
        );
    }

    #[test]
    fn test_generate_permutations_count() {
        let perms = generate_url_permutations("https://example.com/page");
        assert!(
            perms.len() >= 8 && perms.len() <= 32,
            "Expected 8-32 permutations, got {}",
            perms.len()
        );
    }

    #[test]
    fn test_generate_permutations_includes_variants() {
        let perms = generate_url_permutations("https://example.com/page");
        assert!(
            perms.contains(&"http://example.com/page".to_string()),
            "Should include http variant"
        );
        assert!(
            perms.contains(&"https://www.example.com/page".to_string()),
            "Should include www variant"
        );
        assert!(
            perms.contains(&"https://example.com/page/".to_string()),
            "Should include trailing slash variant"
        );
    }

    // Normalizing an already-normalized URL must not change it.
    #[test]
    fn test_normalization_idempotent() {
        let url = "https://example.com/page";
        assert_eq!(
            normalize_url(&normalize_url(url)),
            normalize_url(url),
            "Normalization should be idempotent"
        );
        let complex_url = "http://www.example.com/page/?z=1&a=2#section";
        assert_eq!(
            normalize_url(&normalize_url(complex_url)),
            normalize_url(complex_url),
            "Complex URL normalization should be idempotent"
        );
    }

    // Round trip: every generated permutation must normalize back to one canonical URL.
    #[test]
    fn test_all_permutations_normalize_to_same() {
        let perms = generate_url_permutations("https://example.com/page");
        let normalized: Vec<_> = perms.iter().map(|p| normalize_url(p)).collect();
        let unique: HashSet<_> = normalized.iter().collect();
        if unique.len() > 1 {
            // Dump the full mapping to make a failure diagnosable from the test log.
            eprintln!("Unique normalized URLs: {:?}", unique);
            for perm in &perms {
                eprintln!(" {} -> {}", perm, normalize_url(perm));
            }
        }
        let first = &normalized[0];
        assert!(
            normalized.iter().all(|n| n == first),
            "All permutations should normalize to the same URL. Got: {:?}",
            unique
        );
    }

    // Explicit non-default ports survive normalization.
    #[test]
    fn test_normalize_url_with_port() {
        assert_eq!(
            normalize_url("http://example.com:8080/page"),
            "https://example.com:8080/page"
        );
        assert_eq!(
            normalize_url("http://www.example.com:8080/page/"),
            "https://example.com:8080/page"
        );
    }

    #[test]
    fn test_normalize_url_with_userinfo() {
        let url_with_user = "http://user:pass@example.com/page";
        let normalized = normalize_url(url_with_user);
        assert!(normalized.contains("user:pass@"));
        assert!(normalized.starts_with("https://"));
    }

    #[test]
    fn test_normalize_invalid_url() {
        let invalid = "not a valid url";
        assert_eq!(
            normalize_url(invalid),
            invalid,
            "Invalid URLs should be returned as-is"
        );
    }

    #[test]
    fn test_generate_permutations_invalid_url() {
        let invalid = "not a valid url";
        let perms = generate_url_permutations(invalid);
        assert_eq!(perms.len(), 1, "Invalid URLs should return single element");
        assert_eq!(perms[0], invalid, "Invalid URLs should be returned as-is");
    }

    // Scheme and host are lowercased by parsing; the path keeps its case.
    #[test]
    fn test_normalize_url_mixed_case() {
        assert_eq!(
            normalize_url("HTTP://WWW.EXAMPLE.COM/Page"),
            "https://example.com/Page"
        );
        let normalized = normalize_url("HTTPS://EXAMPLE.COM/MyPage");
        assert!(normalized.starts_with("https://example.com/"));
        assert!(normalized.contains("/MyPage"));
    }

    #[test]
    fn test_normalize_url_non_ascii() {
        let url = "https://example.com/café";
        let normalized = normalize_url(url);
        assert!(
            normalized.contains("caf"),
            "Should handle non-ASCII characters"
        );
    }

    #[test]
    fn test_normalize_url_empty_path() {
        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_complex_query_params() {
        let url = "https://example.com/page?name=John&age=30&city=Boston";
        let normalized = normalize_url(url);
        assert!(normalized.contains("age=30"));
        assert!(normalized.contains("city="));
        assert!(normalized.contains("name="));
        let age_pos = normalized.find("age=").unwrap();
        let city_pos = normalized.find("city=").unwrap();
        let name_pos = normalized.find("name=").unwrap();
        assert!(age_pos < city_pos, "age should come before city");
        assert!(city_pos < name_pos, "city should come before name");
    }

    #[test]
    fn test_normalize_url_preserves_subdomain() {
        assert_eq!(
            normalize_url("https://blog.example.com/page"),
            "https://blog.example.com/page"
        );
        assert_eq!(
            normalize_url("https://www.blog.example.com/page"),
            "https://blog.example.com/page"
        );
    }

    #[test]
    fn test_normalize_multiple_trailing_slashes() {
        assert_eq!(
            normalize_url("https://example.com/page///"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_permutations_with_query_params() {
        let url = "https://example.com/page?key=value";
        let perms = generate_url_permutations(url);
        assert!(perms.iter().any(|p| p.contains("key=value")));
        assert!(perms.len() >= 8);
    }

    // Wall-clock assertions depend on build profile and machine load, so this check is
    // opt-in rather than part of the default test run.
    #[test]
    #[ignore = "timing-sensitive; run explicitly with `cargo test --release -- --ignored`"]
    fn test_normalize_performance() {
        use std::time::Instant;
        let test_urls = vec![
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/page/index.html#section",
            "https://subdomain.example.com/path/to/page/",
        ];
        let iterations = 1000;
        let start = Instant::now();
        for _ in 0..iterations {
            for url in &test_urls {
                let _ = normalize_url(url);
            }
        }
        let elapsed = start.elapsed();
        let avg_per_url = elapsed / (iterations * test_urls.len() as u32);
        assert!(
            avg_per_url.as_micros() < 50,
            "Normalization took {}μs, expected <50μs",
            avg_per_url.as_micros()
        );
    }

    #[test]
    fn test_normalize_url_special_paths() {
        assert_eq!(
            normalize_url("https://example.com/path/with-dashes"),
            "https://example.com/path/with-dashes"
        );
        assert_eq!(
            normalize_url("https://example.com/path_with_underscores"),
            "https://example.com/path_with_underscores"
        );
        assert_eq!(
            normalize_url("https://example.com/path.with.dots"),
            "https://example.com/path.with.dots"
        );
    }

    // A port equal to the scheme's default is not serialized, so it must not appear in
    // the normalized output.
    #[test]
    fn test_normalize_url_removes_default_ports() {
        assert_eq!(
            normalize_url("https://example.com:443/page"),
            "https://example.com/page"
        );
        assert_eq!(
            normalize_url("http://example.com:80/page"),
            "https://example.com/page"
        );
    }
}
/// Print-only walkthroughs of the URL helpers; run with `--nocapture` to see the output.
/// These tests make no assertions.
#[cfg(test)]
mod demo {
    use crate::crawler::url_normalization::{generate_url_permutations, normalize_url};

    /// Prints each sample URL next to its normalized form.
    #[test]
    fn demo_normalization() {
        println!("\n=== URL Normalization Demo ===\n");
        let samples = [
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/index.html#section",
            "https://example.com/page/index.php/",
        ];
        for sample in samples {
            println!(" {} \n → {}\n", sample, normalize_url(sample));
        }
    }

    /// Prints a preview of the permutations of one URL, then the set of distinct
    /// normalized forms those permutations collapse to.
    #[test]
    fn demo_permutations() {
        use std::collections::HashSet;

        // Number of permutations listed before the output is truncated.
        const PREVIEW: usize = 10;

        println!("\n=== URL Permutations Demo ===\n");
        let base = "https://example.com/page";
        let variants = generate_url_permutations(base);
        let total = variants.len();
        println!("Base URL: {}", base);
        println!("Generated {} permutations:\n", total);

        for (position, variant) in variants.iter().take(PREVIEW).enumerate() {
            println!(" {}. {}", position + 1, variant);
        }
        if total > PREVIEW {
            println!(" ... and {} more", total - PREVIEW);
        }

        let mut canonical = HashSet::new();
        for variant in &variants {
            canonical.insert(normalize_url(variant));
        }
        println!(
            "\nAll {} permutations normalize to {} unique URL(s):",
            total,
            canonical.len()
        );
        for form in canonical {
            println!(" → {}", form);
        }
    }
}