use ssukka::Obfuscator;
use std::collections::HashSet;
/// Fixture page shared by every test in this file: a small HTML document whose
/// head carries a title and an inline stylesheet, and whose body holds an
/// article (one heading, two paragraphs) followed by an inline script.
const ARTICLE: &str = r#"<!DOCTYPE html>
<html><head><title>Quarterly Report</title><style>.x{color:red}</style></head>
<body>
<article class="content">
<h1>Acme Corporation Quarterly Earnings</h1>
<p>Revenue reached substantial figures this quarter, driven by international expansion and several new product launches across emerging markets.</p>
<p>The board approved a dividend payable to every shareholder of record before the announced deadline.</p>
</article>
<script>console.log("tracking");</script>
</body></html>"#;
/// Extracts the set of lowercase word tokens a simple, non-JS scraper would see.
///
/// The pipeline is: drop `script` and `style` blocks, drop HTML comments,
/// strip every tag (each closing `>` becomes a space so adjacent elements do
/// not fuse their text), decode character references, then split on anything
/// that is not alphanumeric. Tokens shorter than three bytes are discarded.
fn naive_text(html: &str) -> HashSet<String> {
    let without_scripts = remove_blocks(html, "script");
    let mut markup = remove_blocks(&without_scripts, "style");

    // Remove comments one at a time, always rescanning from the start; an
    // unterminated comment ends the pass and is left in place.
    loop {
        let Some(open_at) = markup.find("<!--") else { break };
        let Some(close_rel) = markup[open_at..].find("-->") else { break };
        markup.replace_range(open_at..open_at + close_rel + 3, "");
    }

    // Keep only the characters that sit outside `<...>`.
    let mut visible = String::with_capacity(markup.len());
    let mut inside_tag = false;
    for ch in markup.chars() {
        if ch == '<' {
            inside_tag = true;
        } else if ch == '>' {
            inside_tag = false;
            visible.push(' ');
        } else if !inside_tag {
            visible.push(ch);
        }
    }

    let decoded = decode_entities(&visible);
    let mut tokens = HashSet::new();
    for word in decoded.split(|c: char| !c.is_alphanumeric()) {
        // `len()` is a byte length, so this is a three-byte minimum.
        if word.len() >= 3 {
            tokens.insert(word.to_lowercase());
        }
    }
    tokens
}
/// Removes every `<tag ...> ... </tag>` block from `s`, matching the tag name
/// ASCII-case-insensitively, and leaves a single space where each block was.
///
/// An opening tag only counts when the name is followed by whitespace, `/` or
/// `>`, so `<styled>` is not mistaken for `<style>`. If an opening tag has no
/// matching `</tag>` after it, that tag and everything following it are kept
/// unchanged.
fn remove_blocks(s: &str, tag: &str) -> String {
    let tag = tag.to_ascii_lowercase();
    let (open, close) = (format!("<{tag}"), format!("</{tag}>"));
    // ASCII lowercasing never changes byte lengths, so every offset found in
    // `lower` is also a valid offset into `s`.
    let lower = s.to_ascii_lowercase();

    let mut out = String::with_capacity(s.len());
    let mut kept_from = 0; // start of the text not yet copied to `out`
    let mut search_from = 0; // where the next opening-tag search begins
    while let Some(rel) = lower[search_from..].find(&open) {
        let start = search_from + rel;
        let after_name = start + open.len();
        let ends_name = lower
            .as_bytes()
            .get(after_name)
            .is_some_and(|b| b.is_ascii_whitespace() || matches!(b, b'>' | b'/'));
        if !ends_name {
            // Longer element name that merely starts with `tag`; skip past it.
            search_from = after_name;
            continue;
        }
        let Some(close_rel) = lower[start..].find(&close) else { break };
        out.push_str(&s[kept_from..start]);
        out.push(' ');
        kept_from = start + close_rel + close.len();
        search_from = kept_from;
    }
    out.push_str(&s[kept_from..]);
    out
}
/// Replaces every character reference that `decode_one` recognises with the
/// character it denotes and copies everything else through unchanged.
///
/// A reference is `&`, then a run of ASCII letters, digits or `#`, then `;`.
/// Looking only at that run keeps the work per `&` proportional to the
/// reference itself instead of the distance to the next `;` in the text.
/// Decoded output is never rescanned, so `&amp;lt;` yields `&lt;`.
fn decode_entities(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];
        // Length of the candidate reference body (may be zero).
        let name_len = after
            .find(|c: char| !(c.is_ascii_alphanumeric() || c == '#'))
            .unwrap_or(after.len());
        let decoded = if after[name_len..].starts_with(';') {
            decode_one(&after[..name_len])
        } else {
            None
        };
        match decoded {
            Some(ch) => {
                out.push(ch);
                rest = &after[name_len + 1..]; // skip the body and its `;`
            }
            None => {
                // Not a reference we understand: keep the `&` literally and
                // continue right after it.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
/// Decodes the body of a single character reference (the text between `&` and
/// `;`) into the character it denotes.
///
/// Supports hexadecimal (`#x41` / `#X41`) and decimal (`#65`) numeric
/// references plus the named references `amp`, `lt`, `gt`, `quot`, `apos` and
/// `nbsp`. Returns `None` for anything else, including numeric references
/// that are empty, contain non-digits, overflow `u32`, or name a value that
/// is not a valid `char` (such as a surrogate).
fn decode_one(e: &str) -> Option<char> {
    /// Parses `digits` in `radix` as a Unicode scalar value.
    fn code_point(digits: &str, radix: u32) -> Option<char> {
        // `from_str_radix` accepts a leading `+`, which is not valid in a
        // character reference, so insist on pure digits before parsing.
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32)
    }

    if let Some(hex) = e.strip_prefix("#x").or_else(|| e.strip_prefix("#X")) {
        return code_point(hex, 16);
    }
    if let Some(dec) = e.strip_prefix('#') {
        return code_point(dec, 10);
    }
    match e {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some('\u{00a0}'),
        _ => None,
    }
}
/// Fraction of `reference` tokens that also appear in `candidate`.
///
/// Returns a value in `0.0..=1.0`; an empty `reference` counts as fully
/// recalled and yields `1.0`.
fn token_recall(reference: &HashSet<String>, candidate: &HashSet<String>) -> f64 {
    match reference.len() {
        0 => 1.0,
        total => {
            let recovered = reference.intersection(candidate).count();
            recovered as f64 / total as f64
        }
    }
}
/// With `structural_obfuscation(true)` and seed 1, the tokens `naive_text`
/// recovers from the obfuscated page must cover less than 30% of the tokens
/// it recovers from the original page.
#[test]
fn structural_obfuscation_starves_non_js_extraction() {
    let expected_tokens = naive_text(ARTICLE);

    let protected = Obfuscator::builder()
        .seed(1)
        .structural_obfuscation(true)
        .build()
        .obfuscate(ARTICLE)
        .unwrap();
    let recovered_tokens = naive_text(&protected);

    let recall = token_recall(&expected_tokens, &recovered_tokens);
    println!("structural: non-JS extractor recall = {recall:.2} (lower is better)");
    assert!(
        recall < 0.3,
        "structural should starve naive extraction, got {recall:.2}"
    );
}
/// With the builder's default settings and seed 1, `naive_text` must still
/// recover more than 80% of the original page's tokens from the obfuscated
/// output.
#[test]
fn cosmetic_default_is_only_friction() {
    let expected_tokens = naive_text(ARTICLE);

    let protected = Obfuscator::builder()
        .seed(1)
        .build()
        .obfuscate(ARTICLE)
        .unwrap();
    let recovered_tokens = naive_text(&protected);

    let recall = token_recall(&expected_tokens, &recovered_tokens);
    println!("cosmetic default: non-JS extractor recall = {recall:.2} (friction only)");
    assert!(
        recall > 0.8,
        "cosmetic obfuscation is friction, text recovers, got {recall:.2}"
    );
}
/// With `split_words(true)` and `encode_text_entities(false)`, the raw
/// obfuscated HTML must no longer contain the literal word "shareholder",
/// yet `naive_text` (which drops comments) must still recover more than 80%
/// of the original tokens.
#[test]
fn comment_split_breaks_substring_search_only() {
    let protected = Obfuscator::builder()
        .seed(1)
        .split_words(true)
        .encode_text_entities(false)
        .build()
        .obfuscate(ARTICLE)
        .unwrap();

    // A plain substring search over the raw markup should come up empty.
    let word_survives = protected.contains("shareholder");
    assert!(
        !word_survives,
        "comment-split should fragment words in raw HTML"
    );

    let expected_tokens = naive_text(ARTICLE);
    let recovered_tokens = naive_text(&protected);
    let recall = token_recall(&expected_tokens, &recovered_tokens);
    println!("comment-split: substring-hidden, DOM recall = {recall:.2}");
    assert!(recall > 0.8, "comment-split does not stop a comment-dropping extractor");
}