/// Heuristically decides whether `html` is a client-rendered shell that needs
/// a JavaScript-capable renderer to produce its real content.
///
/// Only the first 500 000 bytes of `html` are inspected (rounded down to a
/// UTF-8 character boundary), and matching is done on a lowercased copy.
/// The body text length used below is the value reported by
/// `extract_body_text_len`.
///
/// Returns `true` when any of the following holds:
///
/// * body text length is below 200 and a known SPA/framework marker is present;
/// * both `<noscript>` and the phrase `enable javascript` are present;
/// * body text length is below 500 and a site-builder host marker is present;
/// * body text length is below 1000 and there are at least five `<script`
///   occurrences or a Storybook marker.
///
/// Returns `false` otherwise.
pub fn needs_js_rendering(html: &str) -> bool {
    // Slicing a `str` at a byte offset that is not a char boundary panics, so
    // back off until the cut lands on one. Offset 0 is always a boundary,
    // which guarantees the loop terminates without underflow.
    let mut check_len = html.len().min(500_000);
    while !html.is_char_boundary(check_len) {
        check_len -= 1;
    }
    let lower = html[..check_len].to_lowercase();
    let body_len = extract_body_text_len(&lower);

    // Nearly empty body plus a framework mount point / hydration payload.
    if body_len < 200 {
        let spa_indicators = [
            "id=\"root\"",
            "id=\"app\"",
            "id=\"__next\"",
            "id=\"__nuxt\"",
            "id=\"__gatsby\"",
            "id=\"svelte\"",
            "ng-app",
            "data-reactroot",
            "data-reactid",
            "data-remix-run",
            "data-sveltekit",
            "data-astro-",
            "<script src",
            "window.__initial_state__",
            "__next_data__",
            "__nuxt__",
            "__sveltekit_data",
            "window.__remixcontext",
            "window.__astro",
            "gatsby-focus-wrapper",
        ];
        if spa_indicators.iter().any(|ind| lower.contains(ind)) {
            return true;
        }
    }

    // Explicit "please enable JavaScript" fallback, regardless of body size.
    if lower.contains("<noscript>") && lower.contains("enable javascript") {
        return true;
    }

    // Thin body served from a hosted site builder.
    if body_len < 500 {
        let builder_indicators = [
            "framerusercontent.com",
            "webflow.io",
            "wixsite.com",
            "squarespace.com/universal",
        ];
        if builder_indicators.iter().any(|ind| lower.contains(ind)) {
            return true;
        }
    }

    // Modest body that is script-heavy or looks like a Storybook shell.
    if body_len < 1000 {
        let script_count = lower.matches("<script").count();
        if script_count >= 5 {
            return true;
        }
        let storybook_indicators = [
            "id=\"storybook-root\"",
            "id=\"storybook-docs\"",
            "__storybook",
            "?path=/docs/",
            "/iframe.html",
        ];
        if storybook_indicators.iter().any(|ind| lower.contains(ind)) {
            return true;
        }
    }

    false
}
/// Detects a small page whose visible body text contains a generic
/// "are you a bot?" / access-blocked phrase.
///
/// Pages larger than 80 000 bytes, or whose body has more than 600 visible
/// non-whitespace characters, are never flagged. Matching is performed on a
/// lowercased copy of the page with `<script>` and `<style>` blocks removed.
pub fn looks_like_generic_bot_wall(html: &str) -> bool {
    const MAX_HTML_BYTES: usize = 80_000;
    const MAX_VISIBLE_CHARS: usize = 600;
    const WALL_PHRASES: &[&str] = &[
        "performing security verification",
        "verify you are human",
        "checking your browser",
        "enable javascript and cookies",
        "security check",
        "access denied",
        "request blocked",
    ];

    if html.len() > MAX_HTML_BYTES {
        return false;
    }

    let lowered = html.to_lowercase();
    let visible = visible_text_from_stripped_html(&body_html_without_scripts_lower(&lowered));

    // A page with substantial text is treated as real content even if it
    // happens to mention one of the phrases.
    let visible_chars = visible.chars().filter(|c| !c.is_whitespace()).count();
    visible_chars <= MAX_VISIBLE_CHARS && WALL_PHRASES.iter().any(|phrase| visible.contains(phrase))
}
/// Returns `true` when the body text length reported by
/// `extract_body_text_len` is below 200.
///
/// Only the first 500 000 bytes of `html` are inspected (rounded down to a
/// UTF-8 character boundary), on a lowercased copy.
pub fn looks_like_thin_html(html: &str) -> bool {
    // Slicing a `str` off a char boundary panics; step back to the nearest
    // boundary. Offset 0 is always a boundary, so this cannot underflow.
    let mut check_len = html.len().min(500_000);
    while !html.is_char_boundary(check_len) {
        check_len -= 1;
    }
    let lower = html[..check_len].to_lowercase();
    extract_body_text_len(&lower) < 200
}
/// Returns `true` when `markdown_len` is strictly below 100.
pub fn is_thin_markdown(markdown_len: usize) -> bool {
    // Lengths at or above this value are considered substantial.
    const MIN_SUBSTANTIAL_LEN: usize = 100;
    markdown_len < MIN_SUBSTANTIAL_LEN
}
/// Why a rendered page was classified as a failed render.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailedRenderReason {
    /// A Next.js error marker was found in the markup.
    NextJsClientError,
    /// A link to the React error decoder was found in the markup.
    ReactMinifiedError,
    /// The `id="__next"` element had no content before its first `</div>`.
    EmptyNextRoot,
}

impl FailedRenderReason {
    /// Returns the stable snake_case label for this reason.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::NextJsClientError => "nextjs_client_error",
            Self::ReactMinifiedError => "react_minified_error",
            Self::EmptyNextRoot => "empty_next_root",
        }
    }
}
/// Inspects rendered `html` for signs that client-side rendering failed.
///
/// Pages larger than 200 000 bytes are not scanned and yield `None`.
/// Matching is done on a lowercased copy. Checks run in this order:
///
/// 1. Next.js error markers → [`FailedRenderReason::NextJsClientError`]
/// 2. React error-decoder URLs → [`FailedRenderReason::ReactMinifiedError`]
/// 3. An `id="__next"` element whose content up to the first `</div>` is
///    empty or whitespace → [`FailedRenderReason::EmptyNextRoot`]
///
/// Returns `None` when none of them match.
pub fn looks_like_failed_render(html: &str) -> Option<FailedRenderReason> {
    const MAX_HTML_BYTES: usize = 200_000;
    const NEXT_ERROR_MARKERS: &[&str] = &[
        "id=\"__next-error-",
        "data-nextjs-error",
        "id=\"__next_error__\"",
    ];
    const REACT_ERROR_URLS: &[&str] = &[
        "https://react.dev/errors/",
        "https://reactjs.org/docs/error-decoder",
    ];

    if html.len() > MAX_HTML_BYTES {
        return None;
    }

    let page = html.to_lowercase();
    let has_any = |markers: &[&str]| markers.iter().any(|m| page.contains(*m));

    if has_any(NEXT_ERROR_MARKERS) {
        return Some(FailedRenderReason::NextJsClientError);
    }
    if has_any(REACT_ERROR_URLS) {
        return Some(FailedRenderReason::ReactMinifiedError);
    }

    // Walk: `id="__next"` → end of that opening tag → first `</div>`, and
    // look at what lies in between.
    let root_is_empty = page
        .find("id=\"__next\"")
        .map(|at| &page[at..])
        .and_then(|from_id| from_id.find('>').map(|gt| &from_id[gt + 1..]))
        .and_then(|after_tag| after_tag.find("</div>").map(|end| &after_tag[..end]))
        .map_or(false, |inner| inner.trim().is_empty());

    if root_is_empty {
        Some(FailedRenderReason::EmptyNextRoot)
    } else {
        None
    }
}
/// Detects a page whose body is only a loading placeholder.
///
/// Pages larger than 80 000 bytes are never flagged. Working on a lowercased
/// copy with `<script>`/`<style>` blocks removed from the body, the page is a
/// placeholder when:
///
/// * the body has no visible non-whitespace characters at all; or
/// * it has fewer than 400 such characters and the visible text contains a
///   loading phrase; or
/// * it has fewer than 200 such characters and the body markup contains a
///   spinner/loader class, id or aria-label.
pub fn looks_like_loading_placeholder(html: &str) -> bool {
    const LOADING_TEXT: &[&str] = &[
        "loading...",
        "loading…",
        "please wait",
        "just a moment",
        "initializing",
        "preparing",
        "one moment",
    ];
    const SPINNER_MARKUP: &[&str] = &[
        "class=\"spinner",
        "class=\"loader",
        "class=\"loading",
        "class=\"preloader",
        "id=\"loader",
        "id=\"preloader",
        "aria-label=\"loading\"",
    ];

    if html.len() > 80_000 {
        return false;
    }

    let lowered = html.to_lowercase();
    let markup = body_html_without_scripts_lower(&lowered);
    let text = visible_text_from_stripped_html(&markup);

    match text.chars().filter(|c| !c.is_whitespace()).count() {
        0 => true,
        visible => {
            // Phrases are matched against visible text only, so attribute
            // values (e.g. an `alt`) cannot trigger them.
            let says_loading = visible < 400 && LOADING_TEXT.iter().any(|m| text.contains(m));
            // Spinner markers live in attributes, so they are matched
            // against the tag markup instead.
            let shows_spinner = visible < 200 && SPINNER_MARKUP.iter().any(|m| markup.contains(m));
            says_loading || shows_spinner
        }
    }
}
/// Returns the markup inside `<body …>` with `<script>` and `<style>` blocks
/// removed.
///
/// `lower` is expected to be already lowercased, since the tag names are
/// matched literally in lowercase. The body starts after the first `>`
/// following the first `<body` and ends at the last `</body>`.
///
/// If `</body>` is absent the body runs to the end of the input. The end tag
/// is optional in HTML, and it is also lost when a caller passes a truncated
/// prefix of a large document; treating that as an empty body would
/// misreport a full page as having no text.
///
/// Returns an empty string when there is no complete `<body …>` opening tag,
/// or when the last `</body>` does not come after it.
fn body_html_without_scripts_lower(lower: &str) -> String {
    let body_start = lower
        .find("<body")
        .and_then(|i| lower[i..].find('>').map(|j| i + j + 1));
    let body_end = lower.rfind("</body>");

    let body = match (body_start, body_end) {
        (Some(start), Some(end)) if start < end => &lower[start..end],
        // No closing tag: everything after the opening tag is body content.
        (Some(start), None) => &lower[start..],
        _ => return String::new(),
    };

    let stripped = strip_tag_blocks(body, "script");
    strip_tag_blocks(&stripped, "style")
}
/// Extracts the text outside of tags from `stripped` markup.
///
/// Everything between `<` and `>` (inclusive) is dropped. Runs of whitespace
/// in the remaining text collapse to a single space, and leading whitespace
/// is omitted. A trailing space may remain. No separator is inserted where a
/// tag was removed.
fn visible_text_from_stripped_html(stripped: &str) -> String {
    let mut out = String::with_capacity(stripped.len());
    let mut inside_tag = false;
    // Starts `true` so that leading whitespace produces no output.
    let mut last_was_space = true;

    for c in stripped.chars() {
        match c {
            '<' => inside_tag = true,
            '>' => inside_tag = false,
            _ if inside_tag => {}
            _ if c.is_whitespace() => {
                if !last_was_space {
                    out.push(' ');
                    last_was_space = true;
                }
            }
            _ => {
                out.push(c);
                last_was_space = false;
            }
        }
    }

    out
}
/// Counts the visible non-whitespace characters in the document body.
///
/// `lower` is expected to be already lowercased. When it contains no `<body`
/// at all, the fixed value 1000 is returned instead of a measured count.
fn extract_body_text_len(lower: &str) -> usize {
    // Value reported for documents without a `<body` tag.
    const NO_BODY_LEN: usize = 1000;

    if lower.contains("<body") {
        let body_markup = body_html_without_scripts_lower(lower);
        let text = visible_text_from_stripped_html(&body_markup);
        text.chars().filter(|c| !c.is_whitespace()).count()
    } else {
        NO_BODY_LEN
    }
}
/// Removes every `<tag … </tag>` block from `html`.
///
/// A block starts at the literal text `<` + `tag` and ends at the first
/// following `</` + `tag` + `>`. If an opening has no matching close, the
/// opening and everything after it is dropped. Matching is literal and
/// case-sensitive.
fn strip_tag_blocks(html: &str, tag: &str) -> String {
    let opener = format!("<{}", tag);
    let closer = format!("</{}>", tag);

    let mut out = String::with_capacity(html.len());
    let mut rest = html;

    loop {
        let open_at = match rest.find(&opener) {
            Some(idx) => idx,
            None => {
                // No more blocks: the remainder is kept verbatim.
                out.push_str(rest);
                return out;
            }
        };
        out.push_str(&rest[..open_at]);

        let from_open = &rest[open_at..];
        match from_open.find(&closer) {
            Some(close_at) => rest = &from_open[close_at + closer.len()..],
            // Unterminated block: discard the tail.
            None => return out,
        }
    }
}
/// Detects a Cloudflare challenge / interstitial page.
///
/// Pages larger than 80 000 bytes are never flagged. Matching is done on a
/// lowercased copy of `html`.
///
/// Returns `true` when any one "strong" marker is present, or when at least
/// two *distinct* "weak" signals are present. A single weak phrase on its own
/// is not enough.
pub fn looks_like_cloudflare_challenge(html: &str) -> bool {
    if html.len() > 80_000 {
        return false;
    }
    let lower = html.to_lowercase();

    let strong = [
        "cf-browser-verification",
        "cf-challenge-running",
        "/cdn-cgi/challenge-platform/",
        "_cf_chl_opt",
        "__cf_chl_managed_tk__",
        "window._cf_chl_opt",
    ];
    if strong.iter().any(|m| lower.contains(m)) {
        return true;
    }

    // Each inner slice is ONE signal, listing the spellings that count for
    // it. Grouping them means a single phrase can never score twice, however
    // it is spelled. The footer phrase may appear with a literal ampersand or
    // with the ampersand entity-encoded in raw markup.
    let weak: [&[&str]; 4] = [
        &["just a moment"],
        &["checking your browser"],
        &["attention required"],
        &[
            "performance & security by cloudflare",
            "performance &amp; security by cloudflare",
        ],
    ];
    let weak_hits = weak
        .iter()
        .filter(|spellings| spellings.iter().any(|m| lower.contains(*m)))
        .count();
    weak_hits >= 2
}
/// Returns `true` when `header_value`, with surrounding whitespace trimmed
/// and compared ASCII-case-insensitively, is exactly `challenge` or `block`.
///
/// NOTE(review): presumably the value of Cloudflare's `cf-mitigated` response
/// header — confirm against the caller.
pub fn is_cloudflare_mitigated_header(header_value: &str) -> bool {
    let value = header_value.trim();
    ["challenge", "block"]
        .iter()
        .any(|expected| value.eq_ignore_ascii_case(expected))
}
// Unit tests for the HTML heuristics in this file. Each test feeds a small,
// hand-written HTML fixture to one detector and asserts its verdict.
#[cfg(test)]
mod tests {
use super::*;
// --- needs_js_rendering ---
// A body holding only an empty `id="root"` mount point and a script tag is
// an SPA shell.
#[test]
fn detects_spa_shell() {
let html = r#"<html><head></head><body><div id="root"></div><script src="/app.js"></script></body></html>"#;
assert!(needs_js_rendering(html));
}
// A plain article with no framework markers and no scripts needs no JS.
#[test]
fn static_page_no_js_needed() {
let html = r#"<html><body><article><h1>Hello World</h1><p>This is a long article with plenty of text content to read and enjoy. It has multiple paragraphs and lots of useful information.</p></article></body></html>"#;
assert!(!needs_js_rendering(html));
}
// --- looks_like_loading_placeholder ---
// A short body whose visible text includes "Loading..." is a placeholder.
#[test]
fn detects_loading_placeholder_text() {
let html =
r#"<html><body><div><p>Loading...</p><p>Hi! Ask me anything.</p></div></body></html>"#;
assert!(looks_like_loading_placeholder(html));
}
// A body containing nothing but a spinner element (no visible text).
#[test]
fn detects_spinner_only_body() {
let html = r#"<html><body><div class="spinner"></div></body></html>"#;
assert!(looks_like_loading_placeholder(html));
}
// Real prose without loading phrases or spinner markup is not a placeholder.
#[test]
fn real_content_not_placeholder() {
let html = r#"<html><body><article><h1>Welcome to my creative space</h1><p>Waqar Bin Abrar is a full stack developer specializing in MERN stack and Flutter apps, building scalable digital solutions for clients worldwide.</p><p>With years of experience delivering production applications, he combines technical expertise with design sensibility.</p></article></body></html>"#;
assert!(!looks_like_loading_placeholder(html));
}
// "Loading..." here appears only inside an `alt` attribute, i.e. inside a
// tag, so it is not part of the visible text the phrases are matched against.
#[test]
fn logo_alt_loading_on_real_page_not_placeholder() {
let html = r#"<html><body>
<header><img alt="Loading..." src="/logo.png"/></header>
<article>
<h1>Software Engineering Blog</h1>
<p>Thoughts on distributed systems, programming languages, and the craft of writing software that lasts. New posts weekly.</p>
<p>This site covers topics from Rust ownership to Kubernetes operators.</p>
</article>
</body></html>"#;
assert!(!looks_like_loading_placeholder(html));
}
// A body with no visible text at all counts as a placeholder.
#[test]
fn empty_body_is_placeholder() {
let html = r#"<html><body></body></html>"#;
assert!(looks_like_loading_placeholder(html));
}
// Inputs above the 80_000-byte cap are rejected before any marker check.
#[test]
fn large_page_never_placeholder() {
let filler = "x".repeat(100_000);
let html = format!("<html><body><p>Loading...</p>{filler}</body></html>");
assert!(!looks_like_loading_placeholder(&html));
}
// --- looks_like_failed_render ---
// `id="__next-error-…"` marks a Next.js client error.
#[test]
fn detects_nextjs_app_router_error_boundary() {
let html = r#"<html><body><div id="__next-error-0"><h2>Application error: a client-side exception has occurred.</h2></div></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::NextJsClientError)
);
}
// `id="__next_error__"` maps to the same reason.
#[test]
fn detects_nextjs_pages_router_error() {
let html = r#"<html><body><div id="__next_error__">oops</div></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::NextJsClientError)
);
}
// A link to react.dev/errors/ indicates a minified React error.
#[test]
fn detects_react_minified_error() {
let html = r#"<html><body><a href="https://react.dev/errors/418">Minified React error #418</a></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::ReactMinifiedError)
);
}
// The older reactjs.org error-decoder URL is recognized as well.
#[test]
fn detects_legacy_react_error_decoder_url() {
let html = r#"<html><body><a href="https://reactjs.org/docs/error-decoder.html?invariant=31">React</a></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::ReactMinifiedError)
);
}
// Prose that merely quotes the error message, without any error markup,
// must not be reported as a failed render.
#[test]
fn blog_post_about_error_is_not_failed_render() {
let html = r#"<html><body><article><h1>Debugging Next.js</h1>
<p>When you see "Application error: a client-side exception has occurred",
it usually means a hydration mismatch.</p>
<pre><code>console.log('debug')</code></pre>
</article></body></html>"#;
assert!(looks_like_failed_render(html).is_none());
}
// An ordinary page yields no failure reason.
#[test]
fn healthy_page_is_not_failed_render() {
let html =
r#"<html><body><main><h1>Hello</h1><p>Real content here.</p></main></body></html>"#;
assert!(looks_like_failed_render(html).is_none());
}
// Inputs above the 200_000-byte cap are skipped even if a marker is present.
#[test]
fn huge_page_is_not_scanned() {
let mut html = String::from(r#"<html><body><div id="__next-error-0"></div>"#);
html.push_str(&"<p>filler</p>".repeat(20_000));
html.push_str("</body></html>");
assert!(html.len() > 200_000);
assert!(looks_like_failed_render(&html).is_none());
}
// --- looks_like_cloudflare_challenge ---
// One strong marker (`cf-browser-verification`) is sufficient.
#[test]
fn cf_strong_marker_detected() {
let html =
r#"<html><body><div id="cf-browser-verification">Just a moment...</div></body></html>"#;
assert!(looks_like_cloudflare_challenge(html));
}
// A script assigning `window._cf_chl_opt` is a strong marker.
#[test]
fn cf_managed_token_detected() {
let html = r#"<html><body><script>window._cf_chl_opt={cvId:'2'};</script></body></html>"#;
assert!(looks_like_cloudflare_challenge(html));
}
// An ordinary article that merely mentions Cloudflare must not be flagged.
#[test]
fn cf_single_weak_marker_not_enough() {
let html = r#"<html><body><article><h1>Why we use Cloudflare</h1><p>Performance benefits.</p></article></body></html>"#;
assert!(!looks_like_cloudflare_challenge(html));
}
// Two different weak phrases together are enough.
#[test]
fn cf_two_weak_markers_trigger() {
let html =
r#"<html><body><h1>Just a moment...</h1><p>Checking your browser...</p></body></html>"#;
assert!(looks_like_cloudflare_challenge(html));
}
// A "Ray ID" footer is not among the markers and must not trigger.
#[test]
fn cf_ray_id_alone_does_not_trigger() {
let html = r#"<html><body><h1>About</h1><p>Hosted via Cloudflare.</p><footer>Ray ID: abc123</footer></body></html>"#;
assert!(!looks_like_cloudflare_challenge(html));
}
// --- is_cloudflare_mitigated_header ---
// Accepted values, including surrounding whitespace and upper case.
#[test]
fn cf_mitigated_header_challenge() {
assert!(is_cloudflare_mitigated_header("challenge"));
assert!(is_cloudflare_mitigated_header(" CHALLENGE "));
assert!(is_cloudflare_mitigated_header("block"));
}
// Anything else, including the empty string, is rejected.
#[test]
fn cf_mitigated_header_other_values() {
assert!(!is_cloudflare_mitigated_header(""));
assert!(!is_cloudflare_mitigated_header("ok"));
assert!(!is_cloudflare_mitigated_header("verified"));
}
// Inputs above the 80_000-byte cap are skipped even with a strong marker.
#[test]
fn cf_huge_page_not_scanned() {
let mut html = String::from(r#"<html><body><div id="cf-browser-verification">"#);
html.push_str(&"<p>x</p>".repeat(20_000));
html.push_str("</div></body></html>");
assert!(html.len() > 80_000);
assert!(!looks_like_cloudflare_challenge(&html));
}
// Spinner markup that appears only inside a <script> block is removed along
// with the script before markers are searched, so it must not trigger.
#[test]
fn spinner_class_in_script_body_ignored() {
let html = r#"<html><body><article><h1>Real Article</h1><p>This is a real article with substantial content about the topic at hand, providing useful information.</p><script>const x = 'class="spinner"';</script></article></body></html>"#;
assert!(!looks_like_loading_placeholder(html));
}
}