use aho_corasick::AhoCorasick;
use std::sync::LazyLock;
/// Case-insensitive matcher for the opening tags of embedded/visual elements.
///
/// Built lazily on first use. Pattern order is significant:
/// `ContentAnalysis::analyze_internal` dispatches on the matched pattern index
/// (0 = `<iframe`, 1 = `<video`, 2 = `<canvas`, 3 = `<embed`, 4 = `<object`).
static VISUAL_ELEMENT_MATCHER: LazyLock<AhoCorasick> = LazyLock::new(|| {
AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(["<iframe", "<video", "<canvas", "<embed", "<object"])
.expect("valid patterns")
});
/// Case-insensitive matcher for markers that flag a page as having dynamic content.
///
/// The patterns are plain substrings searched anywhere in the document, so they can also
/// match outside of attributes. The tests in this file exercise `data-reactroot` /
/// `id="root"` (React), `__next` (Next.js) and `data-v-` (Vue); `ng-app`, `v-app` and
/// `id="app"` are presumably Angular/Vuetify/Vue mount markers (NOTE(review): confirm).
static SPA_INDICATOR_MATCHER: LazyLock<AhoCorasick> = LazyLock::new(|| {
AhoCorasick::builder()
.ascii_case_insensitive(true)
.build([
"data-reactroot",
"__next",
"id=\"app\"",
"id=\"root\"",
"ng-app",
"v-app",
"data-v-",
])
.expect("valid patterns")
});
/// Case-insensitive matcher for `<svg` opening tags.
///
/// Kept separate from `VISUAL_ELEMENT_MATCHER`: SVG occurrences are counted for size
/// estimation but do not contribute to `has_visual_elements`.
static SVG_MATCHER: LazyLock<AhoCorasick> = LazyLock::new(|| {
AhoCorasick::builder()
.ascii_case_insensitive(true)
.build(["<svg"])
.expect("valid patterns")
});
/// Heuristic summary of an HTML document, produced by `ContentAnalysis::analyze` or
/// `ContentAnalysis::analyze_full`.
///
/// All lengths and sizes are measured in bytes of the input string.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct ContentAnalysis {
/// `true` when `text_length` or `text_ratio` falls below the minimum thresholds.
pub is_thin_content: bool,
/// `true` when any of the iframe, video, canvas or embed counters is non-zero.
pub has_visual_elements: bool,
/// `true` when at least one SPA indicator substring was found in the document.
pub has_dynamic_content: bool,
/// `true` for thin pages, pages with visual elements, or dynamic pages with a low
/// text ratio.
pub needs_screenshot: bool,
/// Number of `<iframe` occurrences (case-insensitive).
pub iframe_count: usize,
/// Number of `<video` occurrences (case-insensitive).
pub video_count: usize,
/// Number of `<canvas` occurrences (case-insensitive).
pub canvas_count: usize,
/// Number of `<embed` and `<object` occurrences combined (case-insensitive).
pub embed_count: usize,
/// Number of `<svg` occurrences (case-insensitive).
pub svg_count: usize,
/// Estimated amount of visible text: non-whitespace bytes outside of tags.
pub text_length: usize,
/// Length of the analyzed HTML in bytes.
pub html_length: usize,
/// `text_length / html_length`, or `0.0` for an empty document.
pub text_ratio: f32,
/// Bytes attributed to SVG elements: measured by `analyze_full`, estimated from
/// `svg_count` by `analyze`.
pub svg_bytes: usize,
/// Bytes attributed to script elements: measured by `analyze_full`, estimated from the
/// tag count by `analyze`.
pub script_bytes: usize,
/// Bytes attributed to style elements: measured by `analyze_full`, estimated from the
/// tag count by `analyze`.
pub style_bytes: usize,
/// Bytes attributed to `data:` URIs: measured by `analyze_full`, estimated from the
/// occurrence count by `analyze`.
pub base64_bytes: usize,
/// Combined SVG, script, style and `data:` URI byte estimate.
pub cleanable_bytes: usize,
/// `cleanable_bytes / html_length`, or `0.0` for an empty document.
pub cleanable_ratio: f32,
/// Labels for the heuristics that fired: `"thin_content"`, `"visual_elements"`,
/// `"dynamic_content"`. Defaults to an empty list when absent during deserialization.
#[serde(default)]
pub indicators: Vec<String>,
}
impl ContentAnalysis {
    /// Pages with less estimated visible text than this are considered thin.
    const MIN_TEXT_LENGTH: usize = 200;
    /// Pages whose text-to-HTML ratio is below this are considered thin.
    const MIN_TEXT_RATIO: f32 = 0.05;
    /// Dynamic (SPA-like) pages below this text ratio are flagged for a screenshot.
    const DYNAMIC_MIN_TEXT_RATIO: f32 = 0.1;
    /// Documents shorter than this many bytes are always flagged by
    /// [`quick_needs_screenshot`](Self::quick_needs_screenshot).
    const QUICK_MIN_HTML_LENGTH: usize = 1000;

    /// Analyzes `html` using the fast path.
    ///
    /// SVG, script, style and `data:` URI sizes are not measured; they are estimated by
    /// multiplying occurrence counts with fixed per-item sizes.
    pub fn analyze(html: &str) -> Self {
        Self::analyze_internal(html, false)
    }

    /// Analyzes `html` and measures SVG, script, style and `data:` URI sizes by scanning
    /// for the actual element boundaries.
    pub fn analyze_full(html: &str) -> Self {
        Self::analyze_internal(html, true)
    }

    /// Shared implementation of [`analyze`](Self::analyze) and
    /// [`analyze_full`](Self::analyze_full).
    ///
    /// `calculate_sizes` selects between measured sizes (`true`) and count-based
    /// estimates (`false`).
    fn analyze_internal(html: &str, calculate_sizes: bool) -> Self {
        let html_bytes = html.as_bytes();
        let html_length = html.len();
        let mut analysis = Self {
            html_length,
            ..Default::default()
        };

        // Pattern indices follow the order in which VISUAL_ELEMENT_MATCHER was built.
        for mat in VISUAL_ELEMENT_MATCHER.find_iter(html_bytes) {
            match mat.pattern().as_usize() {
                0 => analysis.iframe_count += 1,
                1 => analysis.video_count += 1,
                2 => analysis.canvas_count += 1,
                // `<embed` and `<object` share a single counter.
                3 | 4 => analysis.embed_count += 1,
                _ => {}
            }
        }
        analysis.svg_count = SVG_MATCHER.find_iter(html_bytes).count();
        analysis.has_dynamic_content = SPA_INDICATOR_MATCHER.find(html_bytes).is_some();
        analysis.text_length = estimate_text_length(html);

        if calculate_sizes {
            analysis.svg_bytes = estimate_tag_bytes(html, "svg");
            analysis.script_bytes = estimate_tag_bytes(html, "script");
            analysis.style_bytes = estimate_tag_bytes(html, "style");
            analysis.base64_bytes = estimate_base64_bytes(html);
        } else {
            // Fixed per-occurrence size guesses; cheap but only loosely related to the
            // real sizes.
            analysis.svg_bytes = analysis.svg_count * 5_000;
            analysis.script_bytes = count_script_tags_fast(html_bytes) * 10_000;
            analysis.style_bytes = count_style_tags_fast(html_bytes) * 2_000;
            analysis.base64_bytes = estimate_base64_bytes_fast(html_bytes);
        }

        // The per-category figures are estimates (and may overlap, e.g. a data URI inside
        // an SVG), so their sum can exceed the document size. Cleaning can never remove
        // more than the whole document, so cap the total to keep `cleanable_ratio` within
        // 0.0..=1.0 and `html_length - cleanable_bytes` free of underflow.
        analysis.cleanable_bytes = (analysis.svg_bytes
            + analysis.script_bytes
            + analysis.style_bytes
            + analysis.base64_bytes)
            .min(html_length);

        // Ratios are defined as 0.0 for an empty document to avoid dividing by zero.
        let ratio_of = |part: usize| {
            if html_length > 0 {
                part as f32 / html_length as f32
            } else {
                0.0
            }
        };
        analysis.text_ratio = ratio_of(analysis.text_length);
        analysis.cleanable_ratio = ratio_of(analysis.cleanable_bytes);

        analysis.is_thin_content = analysis.text_length < Self::MIN_TEXT_LENGTH
            || analysis.text_ratio < Self::MIN_TEXT_RATIO;
        analysis.has_visual_elements = analysis.iframe_count > 0
            || analysis.video_count > 0
            || analysis.canvas_count > 0
            || analysis.embed_count > 0;

        if analysis.is_thin_content {
            analysis.indicators.push("thin_content".to_string());
        }
        if analysis.has_visual_elements {
            analysis.indicators.push("visual_elements".to_string());
        }
        if analysis.has_dynamic_content {
            analysis.indicators.push("dynamic_content".to_string());
        }

        analysis.needs_screenshot = analysis.is_thin_content
            || analysis.has_visual_elements
            || (analysis.has_dynamic_content
                && analysis.text_ratio < Self::DYNAMIC_MIN_TEXT_RATIO);
        analysis
    }

    /// Cheap screenshot check that avoids building a full [`ContentAnalysis`].
    ///
    /// Returns `true` when the document is shorter than 1000 bytes, contains a visual
    /// element tag, or contains an SPA indicator together with less than
    /// `MIN_TEXT_LENGTH` bytes of estimated visible text.
    #[inline]
    pub fn quick_needs_screenshot(html: &str) -> bool {
        let bytes = html.as_bytes();
        if html.len() < Self::QUICK_MIN_HTML_LENGTH
            || VISUAL_ELEMENT_MATCHER.find(bytes).is_some()
        {
            return true;
        }
        // The text estimate is only computed when an SPA indicator is present.
        SPA_INDICATOR_MATCHER.find(bytes).is_some()
            && estimate_text_length(html) < Self::MIN_TEXT_LENGTH
    }

    /// Returns `true` when `html` contains an `<iframe`, `<video`, `<canvas`, `<embed` or
    /// `<object` tag (case-insensitive).
    #[inline]
    pub fn has_visual_elements_quick(html: &str) -> bool {
        VISUAL_ELEMENT_MATCHER.find(html.as_bytes()).is_some()
    }

    /// Picks a cleaning profile from the analysis results.
    ///
    /// Checked in order: more than half of the document is cleanable -> `Aggressive`;
    /// more than 50,000 bytes of SVG or `data:` URIs -> `Slim`; dynamic or thin
    /// content -> `Minimal`; otherwise `Default`.
    pub fn recommended_cleaning(&self) -> crate::HtmlCleaningProfile {
        use crate::HtmlCleaningProfile;
        if self.cleanable_ratio > 0.5 {
            HtmlCleaningProfile::Aggressive
        } else if self.svg_bytes > 50_000 || self.base64_bytes > 50_000 {
            HtmlCleaningProfile::Slim
        } else if self.has_dynamic_content || self.is_thin_content {
            HtmlCleaningProfile::Minimal
        } else {
            HtmlCleaningProfile::Default
        }
    }

    /// Formats the key metrics as a single line suitable for logging.
    pub fn summary(&self) -> String {
        format!(
            "text={}, html={}, ratio={:.2}, cleanable={:.0}%, screenshot={}",
            self.text_length,
            self.html_length,
            self.text_ratio,
            self.cleanable_ratio * 100.0,
            self.needs_screenshot
        )
    }
}
/// Counts the case-insensitive occurrences of `<script` in `html`.
///
/// This is a substring count, not a parse: closing tags are not matched because they
/// start with `</`.
#[inline]
fn count_script_tags_fast(html: &[u8]) -> usize {
    static SCRIPT_OPEN_TAGS: LazyLock<AhoCorasick> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(["<script"]).expect("valid patterns")
    });

    let occurrences = SCRIPT_OPEN_TAGS.find_iter(html);
    occurrences.count()
}
/// Counts the case-insensitive occurrences of `<style` in `html`.
///
/// This is a substring count, not a parse: closing tags are not matched because they
/// start with `</`.
#[inline]
fn count_style_tags_fast(html: &[u8]) -> usize {
    static STYLE_OPEN_TAGS: LazyLock<AhoCorasick> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(["<style"]).expect("valid patterns")
    });

    let occurrences = STYLE_OPEN_TAGS.find_iter(html);
    occurrences.count()
}
fn estimate_text_length(html: &str) -> usize {
let bytes = html.as_bytes();
let len = bytes.len();
let mut i = 0;
let mut in_script = false;
let mut in_style = false;
let mut text_len = 0;
while i < len {
let remaining = &bytes[i..];
let Some(lt) = memchr::memchr(b'<', remaining) else {
if !in_script && !in_style {
text_len += remaining
.iter()
.filter(|&&b| !b.is_ascii_whitespace())
.count();
}
break;
};
if !in_script && !in_style && lt > 0 {
text_len += remaining[..lt]
.iter()
.filter(|&&b| !b.is_ascii_whitespace())
.count();
}
let tag_start = i + lt;
i = tag_start + 1;
let Some(gt) = memchr::memchr(b'>', &bytes[i..]) else {
break;
};
let tag_inner = &bytes[i..i + gt]; i += gt + 1;
let name_end = tag_inner
.iter()
.position(|&b| b == b' ' || b == b'\t' || b == b'\n' || b == b'\r' || b == b'/')
.unwrap_or(tag_inner.len())
.min(20);
let name = &tag_inner[..name_end];
if name.eq_ignore_ascii_case(b"script") {
in_script = true;
} else if name.eq_ignore_ascii_case(b"/script") {
in_script = false;
} else if name.eq_ignore_ascii_case(b"style") {
in_style = true;
} else if name.eq_ignore_ascii_case(b"/style") {
in_style = false;
}
}
text_len
}
/// Sums the byte length of every `<tag ...>...</tag>` span found in `html`.
///
/// Matching is ASCII case-insensitive. An opening is any occurrence of `<` followed by
/// `tag`; it is paired with the next literal `</tag>` and the whole span (both tags
/// included) is added to the total. Scanning resumes after the closing tag, so nested
/// openings inside a counted span are not counted again. If an opening has no closing
/// tag, scanning stops and the spans counted so far are returned.
fn estimate_tag_bytes(html: &str, tag: &str) -> usize {
    let bytes = html.as_bytes();
    let opening = format!("<{}", tag);
    let closing = format!("</{}>", tag);
    let open_pat = opening.as_bytes();
    let close_pat = closing.as_bytes();

    let mut total = 0;
    let mut cursor = 0;
    while cursor + open_pat.len() <= bytes.len() {
        let Some(offset) = memchr::memchr(b'<', &bytes[cursor..]) else {
            break;
        };
        let start = cursor + offset;

        let opens_here = bytes
            .get(start..start + open_pat.len())
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(open_pat));
        if !opens_here {
            // Some other tag: keep looking right after this `<`.
            cursor = start + 1;
            continue;
        }

        let Some(close_at) = find_ascii_case_insensitive(&bytes[start..], close_pat) else {
            break;
        };
        let span = close_at + close_pat.len();
        total += span;
        cursor = start + span;
    }
    total
}
/// Returns the index of the first ASCII case-insensitive occurrence of `needle` in
/// `haystack`, or `None` when there is none.
///
/// An empty `needle` matches at index 0. Candidate positions are located by searching
/// for either case of the needle's first byte and then comparing the full window.
#[inline]
fn find_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    let Some(&lead) = needle.first() else {
        return Some(0);
    };
    let lead_lower = lead.to_ascii_lowercase();
    let lead_upper = lead.to_ascii_uppercase();

    let mut start = 0;
    loop {
        let tail = haystack.get(start..)?;
        if tail.len() < needle.len() {
            return None;
        }
        let hit = start + memchr::memchr2(lead_lower, lead_upper, tail)?;
        // A candidate too close to the end cannot match, and neither can anything after it.
        let window = haystack.get(hit..hit + needle.len())?;
        if window.eq_ignore_ascii_case(needle) {
            return Some(hit);
        }
        start = hit + 1;
    }
}
/// Estimates how many bytes of `html` are taken up by `data:` URIs.
///
/// Every case-sensitive occurrence of `data:` is treated as the start of a URI that runs
/// up to (not including) the next `"`, `'` or `)`. The payload is not inspected, so
/// non-base64 `data:` occurrences are counted as well.
///
/// Each byte is counted at most once: after a span has been counted, the search resumes
/// at its terminator, so a second `data:` inside an already-counted span does not add to
/// the total again.
fn estimate_base64_bytes(html: &str) -> usize {
    static DATA_FINDER: LazyLock<memchr::memmem::Finder<'static>> =
        LazyLock::new(|| memchr::memmem::Finder::new(b"data:"));

    let bytes = html.as_bytes();
    let mut total = 0;
    let mut search_start = 0;
    while let Some(pos) = DATA_FINDER.find(&bytes[search_start..]) {
        let abs_pos = search_start + pos;
        let Some(end) = memchr::memchr3(b'"', b'\'', b')', &bytes[abs_pos..]) else {
            // No terminator from here to the end of the document, so no later
            // occurrence can be terminated either.
            break;
        };
        total += end;
        // `end` is at least 5 because none of the terminators occurs in `data:`.
        search_start = abs_pos + end;
    }
    total
}
/// Cheap stand-in for `estimate_base64_bytes`: counts the case-insensitive occurrences
/// of `data:` in `html` and assumes a fixed 5,000 bytes for each.
fn estimate_base64_bytes_fast(html: &[u8]) -> usize {
    const ASSUMED_BYTES_PER_DATA_URI: usize = 5_000;
    static DATA_SCHEME: LazyLock<AhoCorasick> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(["data:"]).expect("valid patterns")
    });

    DATA_SCHEME.find_iter(html).count() * ASSUMED_BYTES_PER_DATA_URI
}
// Unit tests for the content-analysis heuristics. The HTML fixtures are raw strings whose
// exact bytes (including whitespace) feed into the length and ratio calculations.
#[cfg(test)]
mod tests {
use super::*;
// A text-heavy page without visual elements must not request a screenshot.
#[test]
fn test_content_analysis_basic() {
let html = r#"
<html>
<head><title>Test</title></head>
<body>
<p>This is some test content with enough text to be substantial for our analysis.</p>
<p>More text here to ensure we have enough content for the analysis threshold.</p>
<p>And even more text to make sure we pass the minimum text length threshold.</p>
<p>Additional paragraph to ensure we have plenty of text content in this page.</p>
<p>The goal is to have over 200 characters of visible text in this HTML document.</p>
</body>
</html>
"#;
let analysis = ContentAnalysis::analyze(html);
assert!(!analysis.has_visual_elements);
assert!(
analysis.text_length >= 200,
"Expected 200+ chars, got {}",
analysis.text_length
);
assert!(!analysis.needs_screenshot);
}
// A single iframe is counted and is enough to request a screenshot.
#[test]
fn test_content_analysis_with_iframe() {
let html = r#"
<html>
<body>
<iframe src="https://example.com"></iframe>
</body>
</html>
"#;
let analysis = ContentAnalysis::analyze(html);
assert!(analysis.has_visual_elements);
assert_eq!(analysis.iframe_count, 1);
assert!(analysis.needs_screenshot);
}
// An empty SPA shell is flagged as dynamic and as thin content.
#[test]
fn test_content_analysis_spa() {
let html = r#"
<html>
<body>
<div id="root" data-reactroot></div>
<script src="bundle.js"></script>
</body>
</html>
"#;
let analysis = ContentAnalysis::analyze(html);
assert!(analysis.has_dynamic_content);
assert!(analysis.is_thin_content);
}
// The quick check fires on visual elements and on very short documents, but not on a
// long, plain document.
#[test]
fn test_quick_needs_screenshot() {
assert!(ContentAnalysis::quick_needs_screenshot(
"<iframe src='x'></iframe>"
));
assert!(ContentAnalysis::quick_needs_screenshot(
"<video src='x'></video>"
));
assert!(ContentAnalysis::quick_needs_screenshot("<canvas></canvas>"));
assert!(ContentAnalysis::quick_needs_screenshot("short"));
let long_text = "a".repeat(2000);
let html = format!("<html><body><p>{}</p></body></html>", long_text);
assert!(!ContentAnalysis::quick_needs_screenshot(&html));
}
// Whitespace is not counted, so "Hello World" contributes 10 bytes; the script body is
// ignored entirely.
#[test]
fn test_estimate_text_length() {
let html = "<p>Hello World</p><script>console.log('ignored')</script>";
let len = estimate_text_length(html);
assert_eq!(len, 10); }
// Visual-element detection is case-insensitive and covers all five tag patterns.
#[test]
fn test_aho_corasick_visual_elements() {
assert!(ContentAnalysis::has_visual_elements_quick(
"<IFRAME src='test'>"
));
assert!(ContentAnalysis::has_visual_elements_quick("<Video>"));
assert!(ContentAnalysis::has_visual_elements_quick("<CANVAS>"));
assert!(ContentAnalysis::has_visual_elements_quick("<embed>"));
assert!(ContentAnalysis::has_visual_elements_quick("<OBJECT>"));
assert!(!ContentAnalysis::has_visual_elements_quick(
"<div>No visuals</div>"
));
}
// React, Next.js and Vue markers are detected; plain HTML is not flagged as dynamic.
#[test]
fn test_spa_detection() {
let react_html = r#"<div id="root" data-reactroot></div>"#;
let analysis = ContentAnalysis::analyze(react_html);
assert!(analysis.has_dynamic_content);
let next_html = r#"<div id="__next"></div>"#;
let analysis = ContentAnalysis::analyze(next_html);
assert!(analysis.has_dynamic_content);
let vue_html = r#"<div data-v-abc123></div>"#;
let analysis = ContentAnalysis::analyze(vue_html);
assert!(analysis.has_dynamic_content);
let plain_html = r#"<div>Plain HTML</div>"#;
let analysis = ContentAnalysis::analyze(plain_html);
assert!(!analysis.has_dynamic_content);
}
// The summary line contains the text, html and ratio fields.
#[test]
fn test_content_analysis_summary() {
let html = r#"<html><body><p>Test content here</p></body></html>"#;
let analysis = ContentAnalysis::analyze(html);
let summary = analysis.summary();
assert!(summary.contains("text="));
assert!(summary.contains("html="));
assert!(summary.contains("ratio="));
}
}