use std::sync::LazyLock;
use regex::Regex;
use unicode_normalization::UnicodeNormalization;
use url::Url;
use crate::error::ToolsError;
/// Matches a single ASCII control character that has no place in a title.
///
/// The class covers U+0000–U+0008, U+000B, U+000C, U+000E–U+001F and
/// U+007F (DEL). Tab (U+0009), line feed (U+000A) and carriage return
/// (U+000D) are intentionally left out of the class, so they are not
/// matched here.
static CONTROL_CHARS: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]";
    // The pattern is a compile-time constant, so a failure here is a bug.
    Regex::new(pattern).unwrap()
});
/// Matches every run of one or more whitespace characters.
///
/// `normalize_title` replaces each match with a single ASCII space. The
/// pattern is `\s+` rather than `\s{2,}` so that a *lone* tab, newline,
/// carriage return or other non-space whitespace character is also turned
/// into a plain space instead of surviving normalization unchanged.
static MULTI_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
    // The pattern is a compile-time constant, so a failure here is a bug.
    Regex::new(r"\s+").unwrap()
});
/// Normalizes a URL string into a canonical textual form.
///
/// The steps are applied in this order:
///
/// 1. Leading and trailing whitespace is trimmed.
/// 2. The remainder is parsed with [`Url::parse`].
/// 3. The host, if any, is lowercased.
/// 4. If a query is present, its key/value pairs are sorted by key and then
///    by value and the query is re-serialized from the sorted pairs.
/// 5. If the path is exactly `/` and the URL has neither a query nor a
///    fragment, the trailing `/` is removed from the serialized form.
///
/// # Errors
///
/// Returns [`ToolsError::InvalidUrl`] when the input is empty after
/// trimming, when it cannot be parsed as a URL, or when the lowercased host
/// is rejected by [`Url::set_host`].
pub fn normalize_url(raw: &str) -> Result<String, ToolsError> {
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Err(ToolsError::InvalidUrl("empty URL".into()));
    }

    let mut parsed =
        Url::parse(trimmed).map_err(|e| ToolsError::InvalidUrl(format!("{trimmed}: {e}")))?;

    if let Some(host) = parsed.host_str() {
        let lower_host = host.to_lowercase();
        // Only touch the URL when lowercasing actually changes the host.
        if lower_host != host {
            parsed
                .set_host(Some(&lower_host))
                .map_err(|e| ToolsError::InvalidUrl(format!("host normalization: {e}")))?;
        }
    }

    if parsed.query().is_some() {
        let mut pairs: Vec<(String, String)> = parsed.query_pairs().into_owned().collect();
        // Tuple ordering compares the key first and the value second. Equal
        // pairs are indistinguishable, so an unstable sort is sufficient.
        pairs.sort_unstable();

        // Scope the serializer so its borrow of `parsed` ends before the URL
        // is read again below.
        {
            let mut query = parsed.query_pairs_mut();
            query.clear();
            for (key, value) in &pairs {
                query.append_pair(key, value);
            }
        }
    }

    let mut result = parsed.to_string();

    // The root slash is the final character of the serialization only when
    // nothing follows the path. Without the fragment check, an input such as
    // `http://example.com/#a/` would lose the last character of its fragment
    // instead of the root slash.
    let is_bare_root =
        parsed.path() == "/" && parsed.query().is_none() && parsed.fragment().is_none();
    if is_bare_root && result.ends_with('/') {
        result.pop();
    }

    Ok(result)
}
/// Produces a cleaned-up copy of `title`.
///
/// The input is first converted to Unicode Normalization Form C. Every
/// match of `CONTROL_CHARS` is then removed, every match of
/// `MULTI_WHITESPACE` is replaced by a single ASCII space, and finally
/// leading and trailing whitespace is trimmed.
pub fn normalize_title(title: &str) -> String {
    let composed = title.nfc().collect::<String>();
    let stripped = CONTROL_CHARS.replace_all(&composed, "");
    let single_spaced = MULTI_WHITESPACE.replace_all(&stripped, " ");
    String::from(single_spaced.trim())
}
/// Returns the `scheme://host[:port]` prefix of `url`.
///
/// The input is trimmed before parsing. `None` is returned when the string
/// cannot be parsed as a URL or when the parsed URL has no host. The
/// `:port` suffix is appended only when [`Url::port`] returns `Some`.
pub fn extract_base_url(url: &str) -> Option<String> {
    let parsed = Url::parse(url.trim()).ok()?;
    let host = parsed.host_str()?;

    let mut base = format!("{}://{}", parsed.scheme(), host);
    if let Some(port) = parsed.port() {
        base.push(':');
        base.push_str(&port.to_string());
    }
    Some(base)
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- normalize_url ----

    #[test]
    fn lowercases_scheme_and_host() {
        let normalized = normalize_url("HTTP://Example.COM/path").unwrap();
        assert!(normalized.starts_with("http://example.com/"));
    }

    #[test]
    fn removes_trailing_slash_on_root() {
        assert_eq!(
            normalize_url("http://example.com/").unwrap(),
            "http://example.com",
        );
    }

    #[test]
    fn preserves_path_trailing_slash() {
        let normalized = normalize_url("http://example.com/path/").unwrap();
        assert!(normalized.contains("/path/"));
    }

    #[test]
    fn sorts_query_params() {
        let normalized = normalize_url("http://example.com/path?z=1&a=2").unwrap();
        assert!(normalized.contains("a=2&z=1"));
    }

    #[test]
    fn empty_url_errors() {
        // Both the empty string and a whitespace-only string must be rejected.
        for blank in ["", " "] {
            assert!(normalize_url(blank).is_err());
        }
    }

    #[test]
    fn invalid_url_errors() {
        let outcome = normalize_url("not a url");
        assert!(outcome.is_err());
    }

    #[test]
    fn preserves_port() {
        let normalized = normalize_url("http://example.com:8080/path").unwrap();
        assert!(normalized.contains(":8080"));
    }

    // ---- normalize_title ----

    #[test]
    fn trims_and_collapses_whitespace() {
        let cleaned = normalize_title(" BBC One ");
        assert_eq!(cleaned, "BBC One");
    }

    #[test]
    fn removes_control_characters() {
        let cleaned = normalize_title("BBC\x00One\x07Two");
        assert_eq!(cleaned, "BBCOneTwo");
    }

    #[test]
    fn applies_nfc_normalization() {
        // "e" followed by a combining acute accent must compose into U+00E9.
        let decomposed = "caf\u{0065}\u{0301}";
        assert_eq!(normalize_title(decomposed), "caf\u{00E9}");
    }

    #[test]
    fn empty_input_returns_empty() {
        for blank in ["", " "] {
            assert_eq!(normalize_title(blank), "");
        }
    }

    #[test]
    fn preserves_normal_text() {
        let untouched = "Sky Sports 1";
        assert_eq!(normalize_title(untouched), "Sky Sports 1");
    }

    // ---- extract_base_url ----

    #[test]
    fn extracts_scheme_host_port() {
        let base = extract_base_url("http://host:8080/path/to/thing?q=1");
        assert_eq!(base.as_deref(), Some("http://host:8080"));
    }

    #[test]
    fn extracts_without_port() {
        let base = extract_base_url("https://example.com/path");
        assert_eq!(base.as_deref(), Some("https://example.com"));
    }

    #[test]
    fn returns_none_for_invalid() {
        assert!(extract_base_url("not a url").is_none());
    }

    #[test]
    fn returns_none_for_empty() {
        assert!(extract_base_url("").is_none());
    }
}