#![cfg(test)]
mod tests {
use crate::tools::clean::utils::{
canonicalize_domain, canonicalize_url, clean_email, clean_phone, decode_html_entities,
};
use crate::tools::clean::*;
/// Named HTML entities must be decoded by `clean_text`.
///
/// The inputs are written as real entity references; a literal `"` inside a
/// string literal would not even compile, and a literal `&` would not
/// exercise the decoder at all.
#[test]
fn test_html_entities_named() {
    let cases = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&apos;", "'"),
        // A lone non-breaking space is expected to vanish entirely
        // (presumably decoded to U+00A0 and then trimmed as whitespace).
        ("&nbsp;", ""),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// Numeric character references, in both decimal and hexadecimal notation,
/// must be decoded by `clean_text`.
///
/// Each character is checked in both notations so the two code paths are
/// covered independently.
#[test]
fn test_html_entities_numeric() {
    let cases = [
        // Apostrophe: decimal, then hexadecimal.
        ("&#39;", "'"),
        ("&#x27;", "'"),
        // Double quote: decimal, then hexadecimal.
        ("&#34;", "\""),
        ("&#x22;", "\""),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// Several entities inside one string must all be decoded, leaving the
/// surrounding text untouched.
#[test]
fn test_html_entities_combined() {
    let cases = [
        // Escaped markup decodes back to literal angle brackets.
        ("&lt;div&gt;", "<div>"),
        ("Tom &amp; Jerry", "Tom & Jerry"),
        // Numeric and named references mixed in one sentence.
        ("It&#39;s &quot;great&quot;", "It's \"great\""),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// Two canonically equivalent spellings of "é" must clean to the same text.
#[test]
fn test_unicode_normalization() {
    // Single code point U+00E9.
    let composed_form = "\u{00E9}";
    // Plain 'e' followed by U+0301 (combining acute accent).
    let combining_form = "e\u{0301}";

    let from_composed = clean_text(composed_form);
    let from_combining = clean_text(combining_form);
    assert_eq!(from_composed, from_combining);

    assert_eq!(clean_text(composed_form), "é");
}
/// Zero-width code points (U+200B, U+200C, U+200D, U+FEFF) are removed
/// without inserting any replacement character.
#[test]
fn test_zero_width_characters() {
    let cases = [
        ("hello\u{200B}world", "helloworld"),
        ("test\u{200C}ing", "testing"),
        ("word\u{200D}join", "wordjoin"),
        ("\u{FEFF}text", "text"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// NUL and SOH are dropped outright, while newline and tab become a space.
#[test]
fn test_control_characters() {
    let cases = [
        // Removed with nothing left behind.
        ("hello\x00world", "helloworld"),
        ("test\x01ing", "testing"),
        // Replaced by a single space.
        ("line1\nline2", "line1 line2"),
        ("tab\there", "tab here"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// CRLF, a bare CR and a tab in one string each end up as one separating space.
#[test]
fn test_clean_text_handles_crlf_mixture() {
    let cleaned = clean_text("First\r\nSecond\rThird\tFourth");
    assert_eq!(cleaned, "First Second Third Fourth");
}
/// Runs of whitespace collapse to a single space and the ends are trimmed.
///
/// The space-based inputs use genuine multi-space runs; with single spaces
/// the input would already equal the expected output and the test would
/// prove nothing about normalization.
#[test]
fn test_whitespace_normalization() {
    let cases = [
        // Interior run of spaces.
        ("hello    world", "hello world"),
        // Leading and trailing padding.
        ("  trim me  ", "trim me"),
        // Runs of newlines and tabs collapse the same way.
        ("multiple\n\n\nlines", "multiple lines"),
        ("lots\t\t\tof\t\ttabs", "lots of tabs"),
        // Padding on both ends of a multi-word phrase.
        ("   leading and trailing   ", "leading and trailing"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// Entity decoding, zero-width removal and control-character removal must
/// all apply to the same input in a single call.
#[test]
fn test_combined_cleaning() {
    // `&amp;` / `&#39;` exercise the entity decoder, U+200B is a zero-width
    // space and the trailing NUL is a control character.
    let dirty = "Hello &amp; &#39;world&#39; with \u{200B}spaces\x00";
    assert_eq!(clean_text(dirty), "Hello & 'world' with spaces");
}
/// Realistic scraped strings that combine entity references with sloppy
/// spacing.
///
/// Each input differs from its expected output, so every case really
/// exercises the cleaner rather than comparing a string with itself.
#[test]
fn test_real_world_examples() {
    let cases = [
        // Entity-encoded ampersand and apostrophe in a brand name.
        ("Ben &amp; Jerry&#39;s Ice Cream", "Ben & Jerry's Ice Cream"),
        // Progressively longer runs of spaces between words.
        (
            "there  are   too    many     spaces!",
            "there are too many spaces!",
        ),
        // Escaped markup surrounded by padding.
        ("  &lt;b&gt;Bold&lt;/b&gt;  text  ", "<b>Bold</b> text"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(clean_text(input), expected);
    }
}
/// Empty input and whitespace-only input both clean to the empty string.
#[test]
fn test_empty_and_whitespace() {
    let blank_inputs = ["", " ", "\n\n\n", "\t\t\t"];
    for &input in &blank_inputs {
        assert_eq!(clean_text(input), "");
    }
}
/// Text that is already clean must come back unchanged.
#[test]
fn test_no_changes_needed() {
    let already_clean = ["perfect text", "no entities here"];
    for &text in &already_clean {
        assert_eq!(clean_text(text), text);
    }
}
/// Ordinary punctuation (hyphen, underscore, period, slash) is left alone.
#[test]
fn test_preserves_intentional_characters() {
    let untouched = ["hello-world", "under_score", "with.period", "a/b/c"];
    for &text in &untouched {
        assert_eq!(clean_text(text), text);
    }
}
/// Emoji, accented Latin text and CJK text pass through unchanged.
#[test]
fn test_unicode_characters() {
    let samples = ["Hello 👋 World 🌍", "Café", "日本語"];
    for &sample in &samples {
        assert_eq!(clean_text(sample), sample);
    }
}
/// HTML comments are removed whether nested inside an element or placed at
/// the document edges.
#[tokio::test]
async fn test_clean_html_removes_comments() {
    let cases = [
        (
            "<div><!-- comment --><p>Text</p></div>",
            "<div><p>Text</p></div>",
        ),
        ("<!-- start --><p>Content</p><!-- end -->", "<p>Content</p>"),
    ];
    for &(html, expected) in &cases {
        assert_eq!(clean_html(html).await, expected);
    }
}
/// A comment that spans several lines is removed as a whole.
#[tokio::test]
async fn test_clean_html_multiline_comments() {
    let html = "<div><!--\nmultiline\ncomment\n--><p>Text</p></div>";
    let cleaned = clean_html(html).await;
    assert_eq!(cleaned, "<div><p>Text</p></div>");
}
/// Runs of spaces between tags collapse to a single space.
///
/// The input uses multi-space runs so that it differs from the expected
/// output; with single spaces the assertion would pass even if no
/// normalization happened at all.
#[tokio::test]
async fn test_clean_html_normalizes_whitespace() {
    let html = "<div>    <p>Text</p>    </div>";
    assert_eq!(clean_html(html).await, "<div> <p>Text</p> </div>");
}
/// Blank lines between tags are reduced to a single space.
#[tokio::test]
async fn test_clean_html_removes_empty_lines() {
    let html = "<div>\n\n<p>Text</p>\n\n</div>";
    let cleaned = clean_html(html).await;
    assert_eq!(cleaned, "<div> <p>Text</p> </div>");
}
/// Comment removal and whitespace collapsing applied to the same document.
#[tokio::test]
async fn test_clean_html_combined() {
    let input = "<div> <!-- comment --> \n\n<p>Text</p> \n\n </div>";
    let cleaned = clean_html(input).await;
    assert_eq!(cleaned, "<div> <p>Text</p> </div>");
}
/// Both inline scripts and empty `src`-only script elements are removed.
#[tokio::test]
async fn test_clean_html_removes_scripts() {
    let cases = [
        // Inline script with a body.
        (
            "<div><script>alert('hi')</script><p>Text</p></div>",
            "<div><p>Text</p></div>",
        ),
        // External script reference.
        (
            "<script src='app.js'></script><p>Content</p>",
            "<p>Content</p>",
        ),
    ];
    for &(html, expected) in &cases {
        assert_eq!(clean_html(html).await, expected);
    }
}
/// `<style>` elements are removed together with their CSS content.
#[tokio::test]
async fn test_clean_html_removes_styles() {
    let html = "<div><style>.red{color:red}</style><p>Text</p></div>";
    let cleaned = clean_html(html).await;
    assert_eq!(cleaned, "<div><p>Text</p></div>");
}
/// `<noscript>`, `<iframe>` and `<svg>` elements are removed entirely,
/// leaving the neighbouring paragraph intact.
#[tokio::test]
async fn test_clean_html_removes_noscript_iframe_svg() {
    let noisy_documents = [
        "<noscript>Enable JS</noscript><p>Text</p>",
        "<iframe src='ad.html'></iframe><p>Text</p>",
        "<svg><circle/></svg><p>Text</p>",
    ];
    for &html in &noisy_documents {
        assert_eq!(clean_html(html).await, "<p>Text</p>");
    }
}
/// Presentational, scripting, data and ARIA attributes are removed while
/// the element and its text remain.
#[tokio::test]
async fn test_clean_html_removes_junk_attributes() {
    let cases = [
        // class / id / style
        (
            "<div class='container' id='main' style='color:red'>Text</div>",
            "<div>Text</div>",
        ),
        // event handler and data-* attribute
        (
            "<button onclick='alert()' data-id='123'>Click</button>",
            "<button>Click</button>",
        ),
        // aria-* and role
        (
            "<div aria-label='info' role='button'>Text</div>",
            "<div>Text</div>",
        ),
    ];
    for &(html, expected) in &cases {
        assert_eq!(clean_html(html).await, expected);
    }
}
/// A JSON-LD script block survives cleaning while an ordinary script in
/// the same document is removed.
#[tokio::test]
async fn test_clean_html_preserves_jsonld() {
    let document = r#"<script>alert('bad')</script><script type="application/ld+json">{"@context":"schema.org"}</script><p>Text</p>"#;
    let output = clean_html(document).await;

    // The structured-data tag and its payload must both still be present.
    let kept_tag = output.contains(r#"<script type="application/ld+json">"#);
    assert!(kept_tag);
    let kept_payload = output.contains(r#"{"@context":"schema.org"}"#);
    assert!(kept_payload);

    // The regular script body must be gone.
    let leaked_script = output.contains("alert('bad')");
    assert!(!leaked_script);
}
/// Upper-case `SCRIPT` and `STYLE` elements are removed as well; the case
/// of the remaining tags is kept as written.
#[tokio::test]
async fn test_clean_html_handles_uppercase_tags() {
    let cleaned =
        clean_html("<DIV><SCRIPT>alert('hi')</SCRIPT><STYLE>body{}</STYLE><P>Text</P></DIV>")
            .await;
    assert_eq!(cleaned, "<DIV><P>Text</P></DIV>");
}
/// JSON-LD detection ignores the case of the tag name, the attribute name
/// and the MIME type.
#[tokio::test]
async fn test_clean_html_preserves_jsonld_case_variants() {
    let document = r#"<SCRIPT TYPE="APPLICATION/LD+JSON">{"@type":"Thing"}</SCRIPT><p>Text</p>"#;
    let output = clean_html(document).await;

    // Compare the tag case-insensitively: the test does not pin the output's case.
    let lowered = output.to_lowercase();
    assert!(lowered.contains(r#"<script type="application/ld+json">"#));

    // The JSON payload itself must be preserved verbatim.
    assert!(output.contains(r#"{"@type":"Thing"}"#));
}
/// A literal backslash-`n` sequence (two characters, not a newline) is
/// turned into a space.
#[tokio::test]
async fn test_clean_html_normalizes_escaped_newlines() {
    let cleaned = clean_html("<div>Line\\nBreak</div>").await;
    assert_eq!(cleaned, "<div>Line Break</div>");
}
/// Three identical URLs collapse to a single entry.
#[test]
fn test_clean_urls_exact_duplicates() {
    let inputs = vec!["https://example.com".to_string(); 3];
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0], "https://example.com");
}
/// The `http` and `https` variants of the same URL count as duplicates.
#[test]
fn test_clean_urls_protocol_normalization() {
    let inputs: Vec<String> = ["http://example.com", "https://example.com"]
        .iter()
        .map(|u| u.to_string())
        .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// Host names that differ only in letter case count as duplicates.
#[test]
fn test_clean_urls_case_normalization() {
    let inputs: Vec<String> = [
        "https://Example.com",
        "https://EXAMPLE.COM",
        "https://example.com",
    ]
    .iter()
    .map(|u| u.to_string())
    .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// A URL with and without the `www.` prefix counts as one entry.
#[test]
fn test_clean_urls_www_stripping() {
    let inputs: Vec<String> = ["https://www.example.com", "https://example.com"]
        .iter()
        .map(|u| u.to_string())
        .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// A trailing slash on the path does not make a URL distinct.
#[test]
fn test_clean_urls_trailing_slash() {
    let inputs: Vec<String> = ["https://example.com/path", "https://example.com/path/"]
        .iter()
        .map(|u| u.to_string())
        .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// The same query parameters in a different order count as one URL.
#[test]
fn test_clean_urls_query_param_order() {
    let inputs: Vec<String> = ["https://example.com?b=2&a=1", "https://example.com?a=1&b=2"]
        .iter()
        .map(|u| u.to_string())
        .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// URLs that differ only by `#fragment` count as duplicates.
#[test]
fn test_clean_urls_fragment_removal() {
    let inputs: Vec<String> = [
        "https://example.com/page#section1",
        "https://example.com/page#section2",
        "https://example.com/page",
    ]
    .iter()
    .map(|u| u.to_string())
    .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// Scheme, `www.`, host case, trailing slash and fragment differences
/// together still resolve to one URL.
#[test]
fn test_clean_urls_combined() {
    let inputs: Vec<String> = [
        "https://example.com/path",
        "HTTP://www.example.com/path/",
        "https://EXAMPLE.COM/path",
        "http://example.com/path#frag",
    ]
    .iter()
    .map(|u| u.to_string())
    .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 1);
}
/// Deduplication keeps the position of each URL's first occurrence.
#[test]
fn test_clean_urls_preserves_order() {
    // The third entry repeats the first and must be dropped.
    let inputs: Vec<String> = [
        "https://example.com/first",
        "https://example.com/second",
        "https://example.com/first",
    ]
    .iter()
    .map(|u| u.to_string())
    .collect();
    let deduped = clean_urls(&inputs);
    assert_eq!(deduped.len(), 2);
    assert_eq!(deduped[0], "https://example.com/first");
    assert_eq!(deduped[1], "https://example.com/second");
}
/// The returned URL is the canonical form: scheme and host are rewritten,
/// the path keeps its original case.
#[test]
fn test_clean_urls_returns_canonical() {
    let inputs = vec![String::from("HTTP://Example.com/Path")];
    let canonical = clean_urls(&inputs);
    assert_eq!(canonical[0], "https://example.com/Path");
}
/// A non-ASCII host name is returned in its punycode (`xn--`) form.
#[test]
fn test_clean_urls_canonicalizes_idna_domains() {
    let inputs = vec![String::from("https://münich.com/path")];
    let canonical = clean_urls(&inputs);
    assert_eq!(canonical[0], "https://xn--mnich-kva.com/path");
}
/// Strings that are not valid URLs are kept in the output, not discarded.
#[test]
fn test_clean_urls_malformed() {
    let inputs: Vec<String> = ["not-a-url", "https://example.com", "also-not-url"]
        .iter()
        .map(|u| u.to_string())
        .collect();
    let result = clean_urls(&inputs);
    // All three distinct entries survive.
    assert_eq!(result.len(), 3);
}
/// An empty input list produces an empty output list.
#[test]
fn test_clean_urls_empty_list() {
    let no_urls: Vec<String> = Vec::new();
    let result = clean_urls(&no_urls);
    assert_eq!(result.len(), 0);
}
/// Padding, letter case and a display-name wrapper do not make an address
/// distinct: all three inputs reduce to one cleaned address.
#[test]
fn test_clean_emails_deduplication() {
    let inputs: Vec<String> = [
        " john@example.com ",
        "John@Example.COM",
        "\"John Doe\" <john@example.com>",
    ]
    .iter()
    .map(|e| e.to_string())
    .collect();
    let deduped = clean_emails(&inputs);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0], "john@example.com");
}
/// A trailing comma is removed, so the address matches its clean twin.
#[test]
fn test_clean_emails_with_punctuation() {
    let inputs: Vec<String> = ["john@example.com,", "john@example.com"]
        .iter()
        .map(|e| e.to_string())
        .collect();
    let deduped = clean_emails(&inputs);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0], "john@example.com");
}
/// Empty and whitespace-only entries are left out of the result.
#[test]
fn test_clean_emails_filters_empty() {
    let inputs: Vec<String> = ["john@example.com", "", "jane@example.com", " "]
        .iter()
        .map(|e| e.to_string())
        .collect();
    let kept = clean_emails(&inputs);
    // Only the two real addresses remain.
    assert_eq!(kept.len(), 2);
}
/// Deduplication keeps the position of each address's first occurrence.
#[test]
fn test_clean_emails_preserves_order() {
    // The third entry repeats the first and must be dropped.
    let inputs: Vec<String> = [
        "first@example.com",
        "second@example.com",
        "first@example.com",
    ]
    .iter()
    .map(|e| e.to_string())
    .collect();
    let deduped = clean_emails(&inputs);
    assert_eq!(deduped.len(), 2);
    assert_eq!(deduped[0], "first@example.com");
    assert_eq!(deduped[1], "second@example.com");
}
/// An empty input list produces an empty output list.
#[test]
fn test_clean_emails_empty_list() {
    let no_emails: Vec<String> = Vec::new();
    let result = clean_emails(&no_emails);
    assert_eq!(result.len(), 0);
}
/// Different punctuation styles of the same number reduce to one
/// digits-only entry.
#[test]
fn test_clean_phones_deduplication() {
    let inputs: Vec<String> = ["(555) 123-4567", "555-123-4567", "555.123.4567"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let deduped = clean_phones(&inputs);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0], "5551234567");
}
/// Empty and whitespace-only entries are left out of the result.
#[test]
fn test_clean_phones_filters_empty() {
    let inputs: Vec<String> = ["555-123-4567", "", "555-987-6543", " "]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let kept = clean_phones(&inputs);
    // Only the two real numbers remain.
    assert_eq!(kept.len(), 2);
}
/// Deduplication keeps the position of each number's first occurrence.
#[test]
fn test_clean_phones_preserves_order() {
    // The third entry is the first number in a different format.
    let inputs: Vec<String> = ["555-123-4567", "555-987-6543", "(555) 123-4567"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let deduped = clean_phones(&inputs);
    assert_eq!(deduped.len(), 2);
    assert_eq!(deduped[0], "5551234567");
    assert_eq!(deduped[1], "5559876543");
}
/// Extensions are discarded, so one base number with two different
/// extensions counts as a single entry.
#[test]
fn test_clean_phones_with_extensions() {
    let inputs: Vec<String> = ["555-123-4567 ext. 123", "555-123-4567 x456"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let deduped = clean_phones(&inputs);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0], "5551234567");
}
/// A number with a `+1` country prefix stays distinct from the same number
/// without it, and the leading `+` is kept.
#[test]
fn test_clean_phones_international_vs_local() {
    let inputs: Vec<String> = ["+1-555-123-4567", "555-123-4567"]
        .iter()
        .map(|p| p.to_string())
        .collect();
    let result = clean_phones(&inputs);
    assert_eq!(result.len(), 2);
    assert_eq!(result[0], "+15551234567");
    assert_eq!(result[1], "5551234567");
}
/// An empty input list produces an empty output list.
#[test]
fn test_clean_phones_empty_list() {
    let no_phones: Vec<String> = Vec::new();
    let result = clean_phones(&no_phones);
    assert_eq!(result.len(), 0);
}
/// Exercises `decode_html_entities` directly, without the additional
/// cleaning that `clean_text` performs.
///
/// The inputs are written as real entity references; raw characters would
/// neither compile (a bare `"` inside a string literal) nor test the decoder.
#[test]
fn test_decode_html_entities() {
    let cases = [
        ("&amp;", "&"),
        // Two adjacent entities decode independently.
        ("&lt;&gt;", "<>"),
        ("&quot;", "\""),
        // Apostrophe in numeric and named form.
        ("&#39;", "'"),
        ("&apos;", "'"),
        // Here the non-breaking space is returned as U+00A0 rather than removed.
        ("&nbsp;", "\u{00A0}"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(decode_html_entities(input), expected);
    }
}
/// Domain canonicalization lower-cases the name and removes one leading
/// `www.` label; other subdomains are kept.
#[test]
fn test_canonicalize_domain() {
    let cases = [
        // Case folding only.
        ("Example.com", "example.com"),
        ("GITHUB.COM", "github.com"),
        // `www.` prefix removed, in any case.
        ("www.example.com", "example.com"),
        ("WWW.EXAMPLE.COM", "example.com"),
        ("WWW.GitHub.com", "github.com"),
        // A bare "www" with nothing after it is left as is.
        ("www", "www"),
        // Non-www subdomains are preserved.
        ("api.example.com", "api.example.com"),
        ("www.api.example.com", "api.example.com"),
    ];
    for &(domain, expected) in &cases {
        assert_eq!(canonicalize_domain(domain), expected);
    }
}
/// Covers the individual URL canonicalization rules, then all of them at
/// once, then the fallback for input that is not a URL.
#[test]
fn test_canonicalize_url() {
    let cases = [
        // Scheme is upgraded to https, or added when missing.
        ("http://example.com", "https://example.com"),
        ("example.com", "https://example.com"),
        ("www.example.com", "https://example.com"),
        // Host is lower-cased and loses `www.`.
        ("https://WWW.Example.COM", "https://example.com"),
        // Trailing slashes are dropped, including a bare root slash.
        ("https://example.com/path/", "https://example.com/path"),
        ("https://example.com/", "https://example.com"),
        // Query parameters are sorted by key.
        ("https://example.com?b=2&a=1", "https://example.com/?a=1&b=2"),
        // Fragments are discarded.
        ("https://example.com/page#section", "https://example.com/page"),
        // Everything together.
        (
            "HTTP://www.Example.COM/path/?b=2&a=1#frag",
            "https://example.com/path?a=1&b=2",
        ),
        // Input that is not a URL is returned unchanged.
        ("not-a-url", "not-a-url"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(canonicalize_url(input), expected);
    }
}
/// `CanonicalUrl::new` stores the canonical form: https scheme, lower-case
/// host without `www.`, no trailing slash, and no `utm_source` parameter.
#[test]
fn test_canonical_url_new_canonicalizes() {
    let canonical = CanonicalUrl::new("HTTP://www.Example.COM/path/?utm_source=x&id=7");
    let rendered = canonical.as_str();
    assert_eq!(rendered, "https://example.com/path?id=7");
}
/// Canonicalizing a URL that is already canonical gives an equal value.
#[test]
fn test_canonical_url_idempotent() {
    let first_pass = CanonicalUrl::new("https://Example.com/");
    let second_pass = CanonicalUrl::new(first_pass.as_str());
    assert_eq!(first_pass, second_pass);
}
/// Serializing a `CanonicalUrl` to JSON gives a bare string, not a
/// wrapping object.
#[test]
fn test_canonical_url_serialize_transparent() {
    let canonical = CanonicalUrl::new("https://example.com");
    let encoded = serde_json::to_string(&canonical).unwrap();
    assert_eq!(encoded, "\"https://example.com\"");
}
/// Deserializing from JSON canonicalizes the value instead of storing the
/// raw string.
#[test]
fn test_canonical_url_deserialize_strict() {
    let raw_json = "\"HTTP://www.Example.COM/path/?utm_source=x\"";
    let decoded = serde_json::from_str::<CanonicalUrl>(raw_json).unwrap();
    assert_eq!(decoded.as_str(), "https://example.com/path");
}
/// Converting from `&str` and from `String` gives equal, canonicalized values.
#[test]
fn test_canonical_url_from_str_and_string() {
    let from_slice: CanonicalUrl = "https://Example.com/".into();
    let from_owned: CanonicalUrl = String::from("https://Example.com/").into();
    assert_eq!(from_slice, from_owned);
    assert_eq!(from_slice.as_str(), "https://example.com");
}
/// Known tracking parameters are removed during canonicalization, while
/// parameters that merely look similar are kept.
#[test]
fn test_canonicalize_url_strips_tracking_params() {
    // Every parameter is a tracking one, so the whole query disappears.
    let only_tracking = canonicalize_url("https://example.com?utm_source=x&utm_medium=y");
    assert_eq!(only_tracking, "https://example.com");

    // A real parameter survives next to a tracking one.
    let mixed = canonicalize_url("https://example.com?utm_source=x&id=7");
    assert_eq!(mixed, "https://example.com/?id=7");

    // Each listed key is removed individually; `foo` must remain.
    let tracking_keys = [
        "fbclid", "gclid", "mc_eid", "mc_cid", "_ga", "igshid", "ref_src", "ref_url",
    ];
    for key in tracking_keys.iter() {
        let url = format!("https://example.com?{}=abc&foo=1", key);
        assert_eq!(
            canonicalize_url(&url),
            "https://example.com/?foo=1",
            "tracking param {} should be stripped",
            key
        );
    }

    // Near-misses: these names resemble tracking keys but must be kept.
    let lookalikes = [
        (
            "https://example.com?referral=abc",
            "https://example.com/?referral=abc",
        ),
        (
            "https://example.com?utmx=abc",
            "https://example.com/?utmx=abc",
        ),
        (
            "https://example.com?ref=branch",
            "https://example.com/?ref=branch",
        ),
    ];
    for &(input, expected) in &lookalikes {
        assert_eq!(canonicalize_url(input), expected);
    }

    // Two links that differ only in tracking noise canonicalize identically.
    let via_fbclid = canonicalize_url("https://example.com/recipe?fbclid=abc");
    let via_utm = canonicalize_url("https://example.com/recipe?utm_source=fb");
    assert_eq!(via_fbclid, via_utm);
}
/// Every input form below cleans to the same bare, lower-case address.
#[test]
fn test_clean_email() {
    let variants = [
        // Surrounding whitespace.
        " john@example.com ",
        // Mixed case.
        "John@Example.COM",
        // Percent-encoded at sign.
        "john%40example.com",
        // Display name, quoted and unquoted.
        "\"John Doe\" <john@example.com>",
        "John Doe <john@example.com>",
        // Trailing punctuation.
        "john@example.com,",
        "john@example.com;",
        "john@example.com.",
        // Padding, display name and mixed case together.
        " \"John\" <John@Example.COM> ",
        // Already clean.
        "john@example.com",
    ];
    for &raw in &variants {
        assert_eq!(clean_email(raw), "john@example.com");
    }
}
/// Phone cleaning keeps digits (and a leading `+`), dropping punctuation,
/// padding and extensions.
#[test]
fn test_clean_phone() {
    let cases = [
        // Punctuation styles.
        ("(555) 123-4567", "5551234567"),
        ("555-123-4567", "5551234567"),
        ("555.123.4567", "5551234567"),
        // The country-code prefix keeps its plus sign.
        ("+1-555-123-4567", "+15551234567"),
        ("+1 (555) 123-4567", "+15551234567"),
        // Extensions in three spellings are discarded.
        ("555-123-4567 ext. 123", "5551234567"),
        ("555-123-4567 x123", "5551234567"),
        ("555-123-4567 extension 123", "5551234567"),
        // Surrounding whitespace.
        (" 555-123-4567 ", "5551234567"),
        // Already clean.
        ("5551234567", "5551234567"),
    ];
    for &(raw, expected) in &cases {
        assert_eq!(clean_phone(raw), expected);
    }
}
}