mod tests;
pub mod utils;
use serde::{Deserialize, Deserializer, Serialize};
pub use utils::canonicalize_url;
/// Newtype around a `String` holding a URL in canonical form.
///
/// The inner field is private, so values are built through the constructors
/// in this module, which pass the raw input through [`canonicalize_url`].
/// Serialization is transparent: the value is written as the bare inner string.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize)]
#[serde(transparent)]
pub struct CanonicalUrl(String);
impl CanonicalUrl {
pub fn new(raw: &str) -> Self {
Self(canonicalize_url(raw))
}
pub fn as_str(&self) -> &str {
&self.0
}
pub fn into_inner(self) -> String {
self.0
}
}
/// Lets a `CanonicalUrl` be passed wherever an `AsRef<str>` is accepted.
impl AsRef<str> for CanonicalUrl {
    fn as_ref(&self) -> &str {
        self.0.as_str()
    }
}
/// Displays the canonical URL verbatim.
impl std::fmt::Display for CanonicalUrl {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Written with `write_str`, so the inner string is emitted as-is.
        let text: &str = self.0.as_str();
        f.write_str(text)
    }
}
impl From<&str> for CanonicalUrl {
fn from(s: &str) -> Self {
Self::new(s)
}
}
impl From<String> for CanonicalUrl {
fn from(s: String) -> Self {
Self::new(&s)
}
}
impl<'de> Deserialize<'de> for CanonicalUrl {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let raw = String::deserialize(deserializer)?;
Ok(Self::new(&raw))
}
}
/// Runs `text` through the module's text-cleaning pipeline and returns the result.
///
/// The stages are applied in this fixed order, each feeding the next:
/// `decode_html_entities`, `normalize_unicode`, `remove_zero_width_chars`,
/// `remove_control_chars`, `normalize_whitespace` (all from `utils`).
pub fn clean_text(text: &str) -> String {
    let decoded = utils::decode_html_entities(text);
    let normalized = utils::normalize_unicode(&decoded);
    let without_zero_width = utils::remove_zero_width_chars(&normalized);
    let without_controls = utils::remove_control_chars(&without_zero_width);
    utils::normalize_whitespace(&without_controls)
}
/// Cleans an HTML string on tokio's blocking thread pool.
///
/// The input is copied into an owned `String` so it can be moved into the
/// blocking closure, which applies `normalize_escaped_newlines`, `strip_junk`
/// and `normalize_whitespace` (from `utils`) in that order.
///
/// # Panics
///
/// Panics with `"clean_html: spawn_blocking failed"` if awaiting the blocking
/// task yields an error.
pub async fn clean_html(html: &str) -> String {
    let owned = html.to_owned();
    let job = tokio::task::spawn_blocking(move || {
        let unescaped = utils::normalize_escaped_newlines(&owned);
        let stripped = utils::strip_junk(&unescaped);
        utils::normalize_whitespace(&stripped)
    });
    job.await.expect("clean_html: spawn_blocking failed")
}
/// Cleans a list of URLs by invoking the crate's `dedupe!` macro with
/// `utils::canonicalize_url`.
///
/// NOTE(review): the macro is defined elsewhere; presumably it applies the
/// given function to each entry and drops duplicates — confirm against the
/// macro definition (including ordering and handling of empty results).
pub fn clean_urls(urls: &[String]) -> Vec<String> {
crate::dedupe!(urls, utils::canonicalize_url)
}
/// Cleans a list of email addresses by invoking the crate's `dedupe!` macro
/// with `utils::clean_email`.
///
/// NOTE(review): the macro is defined elsewhere; presumably it applies the
/// given function to each entry and drops duplicates — confirm against the
/// macro definition (including ordering and handling of empty results).
pub fn clean_emails(emails: &[String]) -> Vec<String> {
crate::dedupe!(emails, utils::clean_email)
}
/// Cleans a list of phone numbers by invoking the crate's `dedupe!` macro
/// with `utils::clean_phone`.
///
/// NOTE(review): the macro is defined elsewhere; presumably it applies the
/// given function to each entry and drops duplicates — confirm against the
/// macro definition (including ordering and handling of empty results).
pub fn clean_phones(phones: &[String]) -> Vec<String> {
crate::dedupe!(phones, utils::clean_phone)
}