use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::BTreeMap;
use unicode_normalization::UnicodeNormalization;
use url::Url;
/// Matches any run of one or more whitespace characters (`\s+`).
/// Compiled lazily on first use.
static WHITESPACE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").expect("valid regex"));
/// Matches a whole `<script>` element whose `type` attribute is
/// `application/ld+json` (single- or double-quoted), from the opening tag
/// through the first following `</script>`.
///
/// Flags: `i` (case-insensitive) and `s` (`.` also matches newlines); the
/// body is matched non-greedily.
static JSONLD_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<script[^>]*type=["']application/ld\+json["'][^>]*>.*?</script>"#)
.expect("valid regex")
});
/// Matches any `<script ...>...</script>` element, case-insensitively and
/// across newlines, stopping at the first closing tag (non-greedy).
static SCRIPT_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("valid regex"));
/// Matches any `<style ...>...</style>` element, case-insensitively and
/// across newlines, stopping at the first closing tag (non-greedy).
static STYLE_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("valid regex"));
/// Matches any `<noscript ...>...</noscript>` element, case-insensitively and
/// across newlines, stopping at the first closing tag (non-greedy).
static NOSCRIPT_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?is)<noscript[^>]*>.*?</noscript>").expect("valid regex"));
/// Matches any `<iframe ...>...</iframe>` element, case-insensitively and
/// across newlines, stopping at the first closing tag (non-greedy).
static IFRAME_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?is)<iframe[^>]*>.*?</iframe>").expect("valid regex"));
/// Matches any `<svg ...>...</svg>` element, case-insensitively and across
/// newlines, stopping at the first closing tag (non-greedy).
static SVG_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?is)<svg[^>]*>.*?</svg>").expect("valid regex"));
/// Matches an HTML comment from `<!--` through the first following `-->`,
/// including comments that span multiple lines (`s` flag).
static COMMENT_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?s)<!--.*?-->").expect("valid regex"));
/// Matches a single `name=value` attribute (including the whitespace in
/// front of it) whose name is one of the presentation, accessibility,
/// SVG, XML-namespace or event-handler names listed in the pattern.
///
/// Flags: `i` (case-insensitive) and `x` (verbose mode: whitespace in the
/// pattern is ignored and `#` starts a comment), which is why the pattern
/// below can carry its own inline annotations. The value may be
/// double-quoted, single-quoted, or unquoted.
///
/// The pattern does not itself check that the match lies inside a tag.
static JUNK_ATTR_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r#"(?ix)
\s+ # Leading whitespace
(?: # Attribute name (non-capturing group)
class|id|style| # Common styling attributes
data-[\w-]+| # All data-* attributes
aria-[\w-]+| # All aria-* attributes
role|tabindex| # Accessibility attributes
xmlns(?::[\w-]+)?| # XML namespaces
version|viewBox| # SVG attributes
fill|fill-rule|stroke(?:-[\w-]+)?| # SVG styling
onclick|onload|on[\w-]+ # Event handlers
)
\s*=\s* # Equals with optional whitespace
(?: # Value (non-capturing group)
"[^"]*"| # Double-quoted value
'[^']*'| # Single-quoted value
[^\s>]+ # Unquoted value
)
"#,
)
.expect("valid regex")
});
/// Matches the two-character escape sequence `\n` (a literal backslash
/// followed by the letter `n`), not an actual line-feed character.
static NEWLINE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"\\n").expect("valid regex"));
/// Query-parameter name prefixes that mark a parameter as tracking-only.
const TRACKING_PARAM_PREFIXES: &[&str] = &["utm_"];

/// Exact query-parameter names that are treated as tracking-only.
const TRACKING_PARAM_NAMES: &[&str] = &[
    "fbclid", "gclid", "mc_eid", "mc_cid", "_ga", "igshid", "ref_src", "ref_url",
];

/// Returns `true` when `key` is a tracking query parameter.
///
/// A key qualifies if it equals one of [`TRACKING_PARAM_NAMES`] or starts
/// with one of [`TRACKING_PARAM_PREFIXES`]. The comparison is case-sensitive.
fn is_tracking_param(key: &str) -> bool {
    if TRACKING_PARAM_NAMES.iter().any(|name| *name == key) {
        return true;
    }
    TRACKING_PARAM_PREFIXES
        .iter()
        .any(|prefix| key.starts_with(*prefix))
}
/// Returns `text` with HTML entities decoded by
/// `html_escape::decode_html_entities`, as an owned `String`.
pub(super) fn decode_html_entities(text: &str) -> String {
    let decoded = html_escape::decode_html_entities(text);
    decoded.into_owned()
}
/// Returns `text` converted to Unicode Normalization Form C (NFC).
pub(super) fn normalize_unicode(text: &str) -> String {
    let composed: String = text.nfc().collect();
    composed
}
/// Returns `text` without the zero-width code points U+200B, U+200C,
/// U+200D and U+FEFF. All other characters are kept in order.
pub(super) fn remove_zero_width_chars(text: &str) -> String {
    const ZERO_WIDTH: [char; 4] = ['\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}'];

    let mut visible = String::with_capacity(text.len());
    for ch in text.chars() {
        if !ZERO_WIDTH.contains(&ch) {
            visible.push(ch);
        }
    }
    visible
}
/// Returns `text` with control characters removed.
///
/// Carriage returns are first turned into line feeds; line feeds and tabs
/// are kept. Every other character for which `char::is_control` is true is
/// dropped. Note that a `\r\n` pair therefore becomes two line feeds.
pub(super) fn remove_control_chars(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\r' | '\n' => cleaned.push('\n'),
            '\t' => cleaned.push('\t'),
            other if other.is_control() => {}
            other => cleaned.push(other),
        }
    }
    cleaned
}
/// Collapses every run of whitespace in `text` to a single space and strips
/// leading and trailing whitespace from the result.
pub(super) fn normalize_whitespace(text: &str) -> String {
    let collapsed = WHITESPACE_REGEX.replace_all(text, " ");
    collapsed.trim().to_owned()
}
/// Replaces every literal `\n` escape sequence (backslash followed by `n`)
/// in `text` with an actual line-feed character.
pub(super) fn normalize_escaped_newlines(text: &str) -> String {
    let unescaped = NEWLINE_REGEX.replace_all(text, "\n");
    unescaped.into_owned()
}
/// Produces a canonical form of a host name.
///
/// The host is ASCII-lowercased and converted with `idna::domain_to_ascii`;
/// if that conversion fails the lowercased input is used as-is. A leading
/// `www.` is then removed, provided something remains after it.
pub fn canonicalize_domain(host: &str) -> String {
    let lowered = host.to_ascii_lowercase();
    let ascii_host = match idna::domain_to_ascii(&lowered) {
        Ok(encoded) => encoded,
        Err(_) => lowered,
    };

    // A bare "www." is left untouched so the result is never empty here.
    let without_www = ascii_host
        .strip_prefix("www.")
        .filter(|rest| !rest.is_empty())
        .map(str::to_owned);
    without_www.unwrap_or(ascii_host)
}
pub fn canonicalize_url(url: &str) -> String {
let url_lower = url.to_ascii_lowercase();
let url_with_protocol = if url_lower.starts_with("http://") || url_lower.starts_with("https://")
{
url.to_string()
} else if url.contains('.') {
format!("https://{}", url)
} else {
url.to_string()
};
let mut parsed = match Url::parse(&url_with_protocol) {
Ok(u) => u,
Err(_) => return url.to_string(), };
let _ = parsed.set_scheme("https");
if let Some(host) = parsed.host_str() {
let canonical_host = canonicalize_domain(host);
let _ = parsed.set_host(Some(&canonical_host));
}
let path = parsed.path().to_string();
let normalized = path.trim_end_matches('/');
let new_path = if normalized.is_empty() {
""
} else {
normalized
};
parsed.set_path(new_path);
if parsed.query().is_some() {
let params: BTreeMap<_, _> = parsed
.query_pairs()
.filter(|(k, _)| !is_tracking_param(k.as_ref()))
.collect();
if !params.is_empty() {
let sorted_query = params
.iter()
.map(|(k, v)| format!("{}={}", k, v))
.collect::<Vec<_>>()
.join("&");
parsed.set_query(Some(&sorted_query));
} else {
parsed.set_query(None);
}
}
parsed.set_fragment(None);
parsed.to_string().trim_end_matches('/').to_string()
}
/// Normalizes and sanity-checks an e-mail address scraped from text.
///
/// Processing order:
/// 1. Trim surrounding whitespace, then trailing `,`, `;` and `.`.
/// 2. If the text contains `<...>`, keep only what is between the first `<`
///    and the first `>` that follows it.
/// 3. Percent-decode (falling back to the undecoded text on error), trim
///    again and ASCII-lowercase.
/// 4. Validate: exactly one `@`, a domain containing `.`, and a final label
///    of 2-10 ASCII letters that is not in the block list below.
///
/// Returns the cleaned address, or an empty string when validation fails.
pub(super) fn clean_email(email: &str) -> String {
    // Final labels that indicate a file name / artefact rather than a TLD.
    const FILE_EXTENSIONS: &[&str] = &[
        "js", "css", "jpg", "jpeg", "png", "gif", "svg", "webp", "ico", "pdf", "doc", "docx",
        "xls", "xlsx", "zip", "tar", "gz", "mp3", "mp4", "avi", "mov", "prod",
    ];

    let trimmed = email.trim().trim_end_matches(&[',', ';', '.'][..]);

    // Look for the closing bracket only *after* the opening one; a stray `>`
    // earlier in the text would otherwise produce an inverted slice range
    // and panic.
    let candidate = match trimmed.find('<') {
        Some(open) => {
            let after_open = &trimmed[open + 1..];
            match after_open.find('>') {
                Some(close) => &after_open[..close],
                None => trimmed,
            }
        }
        None => trimmed,
    };

    let decoded = match urlencoding::decode(candidate) {
        Ok(text) => text.into_owned(),
        Err(_) => candidate.to_string(),
    };
    let result = decoded.trim().to_ascii_lowercase();

    // Exactly one '@' is required.
    let mut parts = result.split('@');
    let domain = match (parts.next(), parts.next(), parts.next()) {
        (Some(_local), Some(domain), None) => domain,
        _ => return String::new(),
    };

    // The domain needs at least one dot; the label after the last dot is
    // treated as the TLD.
    let tld = match domain.rfind('.') {
        Some(dot) => &domain[dot + 1..],
        None => return String::new(),
    };

    let plausible_tld =
        (2..=10).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic());
    if !plausible_tld || FILE_EXTENSIONS.contains(&tld) {
        return String::new();
    }

    result
}
/// Reduces a phone number to its digits, keeping a leading `+` if present.
///
/// The input is trimmed, and anything from an extension marker onwards is
/// discarded: the first `" ext"` (which also covers `" extension"`), or, if
/// there is none, the first `" x"`. Markers are matched ASCII
/// case-insensitively. All remaining non-digit characters are dropped.
pub(super) fn clean_phone(phone: &str) -> String {
    let trimmed = phone.trim();

    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lowered` are valid offsets into `trimmed`. Full Unicode lowercasing
    // can grow or shrink characters and would shift (or invalidate) them.
    let lowered = trimmed.to_ascii_lowercase();
    let cut = lowered.find(" ext").or_else(|| lowered.find(" x"));
    let main_part = match cut {
        Some(pos) => &trimmed[..pos],
        None => trimmed,
    };

    let mut cleaned = String::with_capacity(main_part.len());
    if main_part.starts_with('+') {
        cleaned.push('+');
    }
    cleaned.extend(main_part.chars().filter(char::is_ascii_digit));
    cleaned
}
pub(super) fn strip_junk(html: &str) -> String {
let jsonld_scripts: Vec<String> = JSONLD_REGEX
.captures_iter(html)
.map(|cap| cap.get(0).unwrap().as_str().to_string())
.collect();
let mut cleaned = SCRIPT_REGEX.replace_all(html, "").to_string();
for jsonld in jsonld_scripts {
cleaned = format!("{}{}", cleaned, jsonld);
}
cleaned = STYLE_REGEX.replace_all(&cleaned, "").to_string();
cleaned = NOSCRIPT_REGEX.replace_all(&cleaned, "").to_string();
cleaned = IFRAME_REGEX.replace_all(&cleaned, "").to_string();
cleaned = SVG_REGEX.replace_all(&cleaned, "").to_string();
cleaned = COMMENT_REGEX.replace_all(&cleaned, "").to_string();
cleaned = JUNK_ATTR_REGEX.replace_all(&cleaned, "").to_string();
cleaned
}