use aho_corasick::AhoCorasick;
use std::sync::LazyLock;
/// ASCII-case-insensitive matcher for the opening-tag prefixes of embedded
/// visual elements.
///
/// The position of each pattern matters: `ContentAnalysis::analyze_internal`
/// maps pattern indices to counters (0 = iframe, 1 = video, 2 = canvas,
/// 3 and 4 = embed/object).
static VISUAL_ELEMENT_MATCHER: LazyLock<AhoCorasick> = LazyLock::new(|| {
    let tag_prefixes = ["<iframe", "<video", "<canvas", "<embed", "<object"];
    let mut builder = AhoCorasick::builder();
    builder.ascii_case_insensitive(true);
    builder.build(tag_prefixes).expect("valid patterns")
});
/// ASCII-case-insensitive matcher for markup fragments that are treated as
/// signs of a client-side rendered (single-page application) document.
static SPA_INDICATOR_MATCHER: LazyLock<AhoCorasick> = LazyLock::new(|| {
    let markers = [
        "data-reactroot",
        "__next",
        "id=\"app\"",
        "id=\"root\"",
        "ng-app",
        "v-app",
        "data-v-",
    ];
    let mut builder = AhoCorasick::builder();
    builder.ascii_case_insensitive(true);
    builder.build(markers).expect("valid patterns")
});
/// ASCII-case-insensitive matcher for the `<svg` opening-tag prefix.
static SVG_MATCHER: LazyLock<AhoCorasick> = LazyLock::new(|| {
    let mut builder = AhoCorasick::builder();
    builder.ascii_case_insensitive(true);
    builder.build(["<svg"]).expect("valid patterns")
});
/// Heuristic summary of an HTML document, produced by
/// `ContentAnalysis::analyze` (fast estimates) or
/// `ContentAnalysis::analyze_full` (sizes measured by scanning the markup).
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct ContentAnalysis {
/// True when the visible text is short or makes up only a small share of the HTML.
pub is_thin_content: bool,
/// True when at least one iframe, video, canvas or embed/object tag was found.
pub has_visual_elements: bool,
/// True when a single-page-application marker was found in the markup.
pub has_dynamic_content: bool,
/// True when the page is thin, contains visual elements, or looks like a
/// SPA shell with a low text ratio.
pub needs_screenshot: bool,
/// Number of `<iframe` occurrences.
pub iframe_count: usize,
/// Number of `<video` occurrences.
pub video_count: usize,
/// Number of `<canvas` occurrences.
pub canvas_count: usize,
/// Number of `<embed` and `<object` occurrences combined.
pub embed_count: usize,
/// Number of `<svg` occurrences.
pub svg_count: usize,
/// Estimated count of non-whitespace characters outside of tags, scripts and styles.
pub text_length: usize,
/// Length of the input HTML in bytes.
pub html_length: usize,
/// `text_length` divided by `html_length`; `0.0` for empty input.
pub text_ratio: f32,
/// Bytes attributed to SVG elements (measured by `analyze_full`, estimated
/// from the tag count by `analyze`).
pub svg_bytes: usize,
/// Bytes attributed to script elements (measured or estimated, as above).
pub script_bytes: usize,
/// Bytes attributed to style elements (measured or estimated, as above).
pub style_bytes: usize,
/// Bytes attributed to `data:` URIs (measured or estimated, as above).
pub base64_bytes: usize,
/// Bytes attributed to SVG, script, style and `data:` URI content combined.
pub cleanable_bytes: usize,
/// `cleanable_bytes` divided by `html_length`; `0.0` for empty input.
pub cleanable_ratio: f32,
/// Labels of the conditions that fired: `"thin_content"`,
/// `"visual_elements"`, `"dynamic_content"`. Defaults to empty when the
/// field is missing during deserialization.
#[serde(default)]
pub indicators: Vec<String>,
}
impl ContentAnalysis {
    /// Pages with fewer visible characters than this are considered thin.
    const MIN_TEXT_LENGTH: usize = 200;
    /// Pages whose text-to-HTML ratio is below this are considered thin.
    const MIN_TEXT_RATIO: f32 = 0.05;
    /// Text ratio below which a page with SPA markers is assumed to need a screenshot.
    const SPA_MAX_TEXT_RATIO: f32 = 0.1;
    /// Documents shorter than this many bytes always pass the quick screenshot check.
    const QUICK_MIN_HTML_LENGTH: usize = 1000;
    /// Fast-path size guess for a single `<svg>` element, in bytes.
    const ESTIMATED_SVG_BYTES: usize = 5_000;
    /// Fast-path size guess for a single `<script>` element, in bytes.
    const ESTIMATED_SCRIPT_BYTES: usize = 10_000;
    /// Fast-path size guess for a single `<style>` element, in bytes.
    const ESTIMATED_STYLE_BYTES: usize = 2_000;

    /// Analyzes `html` using fixed per-tag size estimates for the cleanable
    /// categories instead of measuring them.
    pub fn analyze(html: &str) -> Self {
        Self::analyze_internal(html, false)
    }

    /// Analyzes `html` and measures the SVG, script, style and `data:` URI
    /// sizes by scanning the markup.
    pub fn analyze_full(html: &str) -> Self {
        Self::analyze_internal(html, true)
    }

    /// Shared implementation of [`Self::analyze`] and [`Self::analyze_full`].
    ///
    /// When `calculate_sizes` is `true` the cleanable byte counts are measured;
    /// otherwise they are derived from tag counts and fixed per-tag estimates.
    fn analyze_internal(html: &str, calculate_sizes: bool) -> Self {
        let html_bytes = html.as_bytes();
        let html_length = html.len();
        let mut analysis = Self {
            html_length,
            ..Default::default()
        };

        // Pattern indices follow the order the patterns were registered in
        // VISUAL_ELEMENT_MATCHER.
        for mat in VISUAL_ELEMENT_MATCHER.find_iter(html_bytes) {
            match mat.pattern().as_usize() {
                0 => analysis.iframe_count += 1,
                1 => analysis.video_count += 1,
                2 => analysis.canvas_count += 1,
                3 | 4 => analysis.embed_count += 1,
                _ => {}
            }
        }
        analysis.svg_count = SVG_MATCHER.find_iter(html_bytes).count();
        analysis.has_dynamic_content = SPA_INDICATOR_MATCHER.find(html_bytes).is_some();
        analysis.text_length = estimate_text_length(html);

        if calculate_sizes {
            analysis.svg_bytes = estimate_tag_bytes(html, "svg");
            analysis.script_bytes = estimate_tag_bytes(html, "script");
            analysis.style_bytes = estimate_tag_bytes(html, "style");
            analysis.base64_bytes = estimate_base64_bytes(html);
        } else {
            analysis.svg_bytes = analysis.svg_count * Self::ESTIMATED_SVG_BYTES;
            analysis.script_bytes =
                count_script_tags_fast(html_bytes) * Self::ESTIMATED_SCRIPT_BYTES;
            analysis.style_bytes = count_style_tags_fast(html_bytes) * Self::ESTIMATED_STYLE_BYTES;
            analysis.base64_bytes = estimate_base64_bytes_fast(html_bytes);
        }

        // The per-category figures are estimates (and categories can overlap),
        // so their sum may exceed the document size. Cap the total so that
        // `cleanable_ratio` never reports more than 100% of the document.
        let category_total = analysis.svg_bytes
            + analysis.script_bytes
            + analysis.style_bytes
            + analysis.base64_bytes;
        analysis.cleanable_bytes = category_total.min(html_length);

        analysis.text_ratio = Self::fraction_of(analysis.text_length, html_length);
        analysis.cleanable_ratio = Self::fraction_of(analysis.cleanable_bytes, html_length);

        analysis.is_thin_content = analysis.text_length < Self::MIN_TEXT_LENGTH
            || analysis.text_ratio < Self::MIN_TEXT_RATIO;
        analysis.has_visual_elements = analysis.iframe_count > 0
            || analysis.video_count > 0
            || analysis.canvas_count > 0
            || analysis.embed_count > 0;

        // Labels are pushed in this fixed order.
        for (fired, label) in [
            (analysis.is_thin_content, "thin_content"),
            (analysis.has_visual_elements, "visual_elements"),
            (analysis.has_dynamic_content, "dynamic_content"),
        ] {
            if fired {
                analysis.indicators.push(label.to_string());
            }
        }

        analysis.needs_screenshot = analysis.is_thin_content
            || analysis.has_visual_elements
            || (analysis.has_dynamic_content && analysis.text_ratio < Self::SPA_MAX_TEXT_RATIO);
        analysis
    }

    /// Returns `part / whole` as `f32`, or `0.0` when `whole` is zero.
    fn fraction_of(part: usize, whole: usize) -> f32 {
        if whole == 0 {
            0.0
        } else {
            part as f32 / whole as f32
        }
    }

    /// Cheap yes/no check for whether a screenshot is worthwhile, without
    /// building a full [`ContentAnalysis`].
    ///
    /// Returns `true` for very short documents, for documents containing a
    /// visual element tag, and for documents with an SPA marker whose
    /// estimated text length is below [`Self::MIN_TEXT_LENGTH`].
    #[inline]
    pub fn quick_needs_screenshot(html: &str) -> bool {
        // Cheapest test first: no scanning needed for tiny documents.
        if html.len() < Self::QUICK_MIN_HTML_LENGTH {
            return true;
        }
        let bytes = html.as_bytes();
        if VISUAL_ELEMENT_MATCHER.find(bytes).is_some() {
            return true;
        }
        SPA_INDICATOR_MATCHER.find(bytes).is_some()
            && estimate_text_length(html) < Self::MIN_TEXT_LENGTH
    }

    /// Returns `true` if `html` contains an iframe, video, canvas, embed or
    /// object opening tag (ASCII case-insensitive).
    #[inline]
    pub fn has_visual_elements_quick(html: &str) -> bool {
        VISUAL_ELEMENT_MATCHER.find(html.as_bytes()).is_some()
    }

    /// Picks a cleaning profile from the analysis results.
    ///
    /// Precedence: mostly-cleanable documents get `Aggressive`; documents
    /// with large SVG or `data:` URI payloads get `Slim`; SPA-like or thin
    /// documents get `Minimal`; everything else gets `Default`.
    pub fn recommended_cleaning(&self) -> crate::HtmlCleaningProfile {
        use crate::HtmlCleaningProfile;
        if self.cleanable_ratio > 0.5 {
            HtmlCleaningProfile::Aggressive
        } else if self.svg_bytes > 50_000 || self.base64_bytes > 50_000 {
            HtmlCleaningProfile::Slim
        } else if self.has_dynamic_content || self.is_thin_content {
            HtmlCleaningProfile::Minimal
        } else {
            HtmlCleaningProfile::Default
        }
    }

    /// Formats the key figures of the analysis as a single log-friendly line.
    pub fn summary(&self) -> String {
        format!(
            "text={}, html={}, ratio={:.2}, cleanable={:.0}%, screenshot={}",
            self.text_length,
            self.html_length,
            self.text_ratio,
            self.cleanable_ratio * 100.0,
            self.needs_screenshot
        )
    }
}
/// Counts occurrences of the `<script` opening-tag prefix in `html`
/// (ASCII case-insensitive).
#[inline]
fn count_script_tags_fast(html: &[u8]) -> usize {
    static OPENING_SCRIPT: LazyLock<AhoCorasick> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(["<script"]).expect("valid patterns")
    });

    let occurrences = OPENING_SCRIPT.find_iter(html);
    occurrences.count()
}
/// Counts occurrences of the `<style` opening-tag prefix in `html`
/// (ASCII case-insensitive).
#[inline]
fn count_style_tags_fast(html: &[u8]) -> usize {
    static OPENING_STYLE: LazyLock<AhoCorasick> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(["<style"]).expect("valid patterns")
    });

    let occurrences = OPENING_STYLE.find_iter(html);
    occurrences.count()
}
/// Estimates the amount of visible text in `html`.
///
/// Counts non-whitespace characters that are outside of tags and outside of
/// `<script>` / `<style>` elements. This is a single-pass heuristic, not an
/// HTML parser: angle brackets are never counted, and a `>` inside an
/// attribute value ends the tag early.
///
/// Only the tag *name* (the token before the first whitespace or `/`) is
/// compared against `script` / `style`, so opening tags that carry
/// attributes, such as `<script src="app.js">`, are recognized too.
fn estimate_text_length(html: &str) -> usize {
    // Longest tag name worth remembering; anything longer cannot be
    // "/script" or "/style" and is simply truncated.
    const MAX_TAG_NAME_LEN: usize = 20;

    let mut in_tag = false;
    let mut name_complete = false;
    let mut in_script = false;
    let mut in_style = false;
    let mut text_len = 0;
    let mut tag_name = String::new();

    for c in html.chars() {
        match c {
            '<' => {
                in_tag = true;
                name_complete = false;
                tag_name.clear();
            }
            '>' => {
                in_tag = false;
                if tag_name.eq_ignore_ascii_case("script") {
                    in_script = true;
                } else if tag_name.eq_ignore_ascii_case("/script") {
                    in_script = false;
                } else if tag_name.eq_ignore_ascii_case("style") {
                    in_style = true;
                } else if tag_name.eq_ignore_ascii_case("/style") {
                    in_style = false;
                }
            }
            _ if in_tag => {
                if name_complete {
                    // Attributes are irrelevant for script/style detection.
                    continue;
                }
                // A leading '/' belongs to the name of a closing tag; a later
                // '/' (as in `<br/>`) or any whitespace terminates the name.
                let ends_name = c.is_whitespace() || (c == '/' && !tag_name.is_empty());
                if ends_name {
                    name_complete = true;
                } else if tag_name.len() < MAX_TAG_NAME_LEN {
                    tag_name.push(c);
                }
            }
            _ if !in_script && !in_style && !c.is_whitespace() => text_len += 1,
            _ => {}
        }
    }
    text_len
}
/// Sums the byte length of every `<tag ...> ... </tag>` span in `html`.
///
/// Matching is ASCII case-insensitive for both the document and `tag`.
/// Each span runs from an occurrence of `<tag` to the end of the next
/// `</tag>`; scanning stops at the first opening tag with no closing tag.
/// The opening pattern is a plain prefix match, so `<tag` also matches longer
/// element names that start with `tag`.
fn estimate_tag_bytes(html: &str, tag: &str) -> usize {
    let tag = tag.to_ascii_lowercase();
    let open = format!("<{tag}");
    let close = format!("</{tag}>");

    // ASCII-only lower-casing keeps every byte offset identical to the
    // original document, so the measured sizes are sizes of the real input.
    // (Unicode lower-casing can change the encoded length of some characters.)
    let haystack = html.to_ascii_lowercase();

    let mut total = 0;
    let mut rest = haystack.as_str();
    while let Some(start) = rest.find(&open) {
        let element = &rest[start..];
        let Some(close_at) = element.find(&close) else {
            break;
        };
        let span_len = close_at + close.len();
        total += span_len;
        rest = &element[span_len..];
    }
    total
}
/// Sums the byte length of every `data:` URI in `html`.
///
/// A URI is taken to run from `data:` up to (not including) the next `"`,
/// `'` or `)`. After a URI has been counted, scanning resumes at its
/// terminator, so a `data:` substring inside an already-counted URI is not
/// counted a second time. A `data:` with no terminator anywhere after it
/// contributes nothing and ends the scan, because no later occurrence can
/// have a terminator either.
fn estimate_base64_bytes(html: &str) -> usize {
    const SCHEME: &str = "data:";

    let mut total = 0;
    let mut rest = html;
    while let Some(pos) = rest.find(SCHEME) {
        let uri = &rest[pos..];
        let Some(uri_len) = uri.find(['"', '\'', ')']) else {
            break;
        };
        total += uri_len;
        // `uri_len` is at least `SCHEME.len()`, so the scan always advances.
        rest = &uri[uri_len..];
    }
    total
}
/// Rough size estimate for `data:` URIs: the number of `data:` occurrences
/// (ASCII case-insensitive) multiplied by a fixed 5,000 bytes each.
fn estimate_base64_bytes_fast(html: &[u8]) -> usize {
    static DATA_SCHEME: LazyLock<AhoCorasick> = LazyLock::new(|| {
        let mut builder = AhoCorasick::builder();
        builder.ascii_case_insensitive(true);
        builder.build(["data:"]).expect("valid patterns")
    });

    let occurrences = DATA_SCHEME.find_iter(html).count();
    occurrences * 5_000
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A text-heavy page without visual elements needs no screenshot.
    #[test]
    fn test_content_analysis_basic() {
        let page = r#"
<html>
<head><title>Test</title></head>
<body>
<p>This is some test content with enough text to be substantial for our analysis.</p>
<p>More text here to ensure we have enough content for the analysis threshold.</p>
<p>And even more text to make sure we pass the minimum text length threshold.</p>
<p>Additional paragraph to ensure we have plenty of text content in this page.</p>
<p>The goal is to have over 200 characters of visible text in this HTML document.</p>
</body>
</html>
"#;
        let result = ContentAnalysis::analyze(page);

        assert!(!result.has_visual_elements);
        assert!(
            result.text_length >= 200,
            "Expected 200+ chars, got {}",
            result.text_length
        );
        assert!(!result.needs_screenshot);
    }

    /// An iframe is counted and forces a screenshot.
    #[test]
    fn test_content_analysis_with_iframe() {
        let page = r#"
<html>
<body>
<iframe src="https://example.com"></iframe>
</body>
</html>
"#;
        let result = ContentAnalysis::analyze(page);

        assert!(result.has_visual_elements);
        assert_eq!(result.iframe_count, 1);
        assert!(result.needs_screenshot);
    }

    /// An empty React shell is both dynamic and thin.
    #[test]
    fn test_content_analysis_spa() {
        let page = r#"
<html>
<body>
<div id="root" data-reactroot></div>
<script src="bundle.js"></script>
</body>
</html>
"#;
        let result = ContentAnalysis::analyze(page);

        assert!(result.has_dynamic_content);
        assert!(result.is_thin_content);
    }

    /// The quick check fires for visual tags and tiny inputs, but not for a
    /// long plain-text page.
    #[test]
    fn test_quick_needs_screenshot() {
        let should_fire = [
            "<iframe src='x'></iframe>",
            "<video src='x'></video>",
            "<canvas></canvas>",
            "short",
        ];
        for snippet in should_fire {
            assert!(
                ContentAnalysis::quick_needs_screenshot(snippet),
                "expected screenshot for {snippet:?}"
            );
        }

        let filler = "a".repeat(2000);
        let long_page = format!("<html><body><p>{}</p></body></html>", filler);
        assert!(!ContentAnalysis::quick_needs_screenshot(&long_page));
    }

    /// Script contents and whitespace are excluded from the text estimate.
    #[test]
    fn test_estimate_text_length() {
        let markup = "<p>Hello World</p><script>console.log('ignored')</script>";
        assert_eq!(estimate_text_length(markup), 10);
    }

    /// Visual element detection ignores ASCII case.
    #[test]
    fn test_aho_corasick_visual_elements() {
        let visual = [
            "<IFRAME src='test'>",
            "<Video>",
            "<CANVAS>",
            "<embed>",
            "<OBJECT>",
        ];
        for snippet in visual {
            assert!(
                ContentAnalysis::has_visual_elements_quick(snippet),
                "expected a visual element in {snippet:?}"
            );
        }

        assert!(!ContentAnalysis::has_visual_elements_quick(
            "<div>No visuals</div>"
        ));
    }

    /// React, Next.js and Vue markers are detected; plain markup is not.
    #[test]
    fn test_spa_detection() {
        let cases = [
            (r#"<div id="root" data-reactroot></div>"#, true),
            (r#"<div id="__next"></div>"#, true),
            (r#"<div data-v-abc123></div>"#, true),
            (r#"<div>Plain HTML</div>"#, false),
        ];
        for (markup, expected) in cases {
            let result = ContentAnalysis::analyze(markup);
            assert_eq!(
                result.has_dynamic_content, expected,
                "unexpected SPA verdict for {markup:?}"
            );
        }
    }

    /// The summary line exposes the text, html and ratio figures.
    #[test]
    fn test_content_analysis_summary() {
        let page = r#"<html><body><p>Test content here</p></body></html>"#;
        let line = ContentAnalysis::analyze(page).summary();

        for key in ["text=", "html=", "ratio="] {
            assert!(line.contains(key));
        }
    }
}