use url::Url;
/// Normalizes the path of `u` in place: collapses runs of consecutive
/// slashes into a single slash and drops a trailing slash, while never
/// reducing the bare root path "/" to an empty path.
fn clean_url_path(u: &mut Url) {
    let mut path = u.path().to_owned();
    // Collapse repeated separators first. A single `replace` pass leaves
    // residue for runs of 3+ slashes (e.g. "///" -> "//"), so loop until
    // no doubled slash remains.
    while path.contains("//") {
        path = path.replace("//", "/");
    }
    // Strip the trailing slash *after* collapsing, so inputs like "/a//"
    // normalize all the way to "/a" rather than stopping at "/a/".
    if path.len() > 1 && path.ends_with('/') {
        path.truncate(path.len() - 1);
    }
    if path != u.path() {
        u.set_path(&path);
    }
}
/// Canonicalizes `raw` for comparison/storage: drops the fragment, sorts
/// the query parameters, and tidies the path via [`clean_url_path`].
/// Input that does not parse as an absolute URL is returned unchanged.
pub(crate) fn normalize_url(raw: &str) -> String {
    if let Ok(mut u) = Url::parse(raw) {
        u.set_fragment(None);
        let mut pairs: Vec<(String, String)> = u
            .query_pairs()
            .map(|(k, v)| (k.into_owned(), v.into_owned()))
            .collect();
        if !pairs.is_empty() {
            pairs.sort();
            // Re-serialize through the form-urlencoded serializer so keys and
            // values get percent-encoded again. `query_pairs()` *decodes* the
            // pairs, so rejoining them with plain string formatting would
            // corrupt any value containing '&', '=' or '%'
            // (e.g. "?a=b%26c" would round-trip to the different "?a=b&c").
            u.query_pairs_mut()
                .clear()
                .extend_pairs(pairs.iter().map(|(k, v)| (k.as_str(), v.as_str())));
        }
        clean_url_path(&mut u);
        u.to_string()
    } else {
        raw.to_owned()
    }
}
#[cfg(not(target_arch = "wasm32"))]
/// Aggressive normalization used for de-duplication: removes both the
/// fragment and the entire query string, then tidies the path.
/// Unparseable input is returned verbatim.
pub(crate) fn normalize_url_for_dedup(raw: &str) -> String {
    match Url::parse(raw) {
        Ok(mut url) => {
            url.set_fragment(None);
            url.set_query(None);
            clean_url_path(&mut url);
            url.to_string()
        }
        Err(_) => raw.to_owned(),
    }
}
/// Builds the robots.txt URL for the origin of `parsed`
/// (scheme plus authority, including any userinfo and port).
pub(crate) fn robots_url(parsed: &Url) -> String {
    let scheme = parsed.scheme();
    let authority = parsed.authority();
    format!("{scheme}://{authority}/robots.txt")
}
/// Returns `url` with its `#fragment` removed; input that fails to parse
/// is returned unchanged.
pub(crate) fn strip_fragment(url: &str) -> String {
    match Url::parse(url) {
        Ok(mut parsed) => {
            parsed.set_fragment(None);
            parsed.to_string()
        }
        Err(_) => url.to_owned(),
    }
}
/// If `url_str` parses and points at a different host than `base`,
/// rebuilds it on `base`'s scheme/host/port, carrying over only the
/// original path and query. Otherwise returns `url_str` unchanged.
pub(crate) fn rewrite_url_host(url_str: &str, base: &Url) -> String {
    match Url::parse(url_str) {
        Ok(parsed) if parsed.host_str() != base.host_str() => {
            let mut rebuilt = base.clone();
            rebuilt.set_path(parsed.path());
            rebuilt.set_query(parsed.query());
            rebuilt.to_string()
        }
        // Same host or unparseable: leave the caller's string as-is.
        _ => url_str.to_owned(),
    }
}
pub(crate) fn resolve_redirect(base_url: &str, target: &str) -> String {
if target.starts_with("http://") || target.starts_with("https://") {
return target.to_owned();
}
if let Ok(base) = Url::parse(base_url)
&& let Ok(resolved) = base.join(target)
{
return resolved.to_string();
}
target.to_owned()
}