use regex::Regex;
use std::sync::LazyLock;
/// Lazily compiled, case-sensitive pattern for the `Reference #<n>.<hex>.<n>.<hex>`
/// identifier that `looks_like_vendor_block` treats as an Akamai block-page marker.
///
/// The pattern is a fixed literal, so a compilation failure would be a
/// programming error; hence the `expect`.
static AKAMAI_REF_RE: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"Reference #\d+\.[0-9a-f]+\.\d+\.[0-9a-f]+";
    Regex::new(pattern).expect("static regex")
});
/// Heuristically decides whether a page needs JavaScript rendering to expose
/// its real content.
///
/// Only the first 500,000 bytes of `html` are inspected, lowercased. The amount
/// of visible body text (as reported by `extract_body_text_len`) is combined
/// with markers searched for anywhere in that prefix:
///
/// * fewer than 200 visible characters **and** an SPA mount point / hydration marker;
/// * a `<noscript>` tag together with the phrase "enable javascript";
/// * fewer than 500 visible characters **and** a site-builder asset domain;
/// * fewer than 1000 visible characters **and** either five or more `<script`
///   tags or a Storybook marker.
///
/// Returns `true` as soon as one rule matches, `false` otherwise.
pub fn needs_js_rendering(html: &str) -> bool {
    // Cap the amount of work on huge documents. The cut-off is moved back to
    // the nearest char boundary because slicing a `str` in the middle of a
    // multi-byte code point panics. Index 0 is always a boundary, so the loop
    // terminates.
    let mut check_len = html.len().min(500_000);
    while !html.is_char_boundary(check_len) {
        check_len -= 1;
    }
    let lower = html[..check_len].to_lowercase();
    let body_len = extract_body_text_len(&lower);

    // Nearly empty body plus a framework mount point or hydration payload.
    if body_len < 200 {
        let spa_indicators = [
            "id=\"root\"",
            "id=\"app\"",
            "id=\"__next\"",
            "id=\"__nuxt\"",
            "id=\"__gatsby\"",
            "id=\"svelte\"",
            "ng-app",
            "data-reactroot",
            "data-reactid",
            "data-remix-run",
            "data-sveltekit",
            "data-astro-",
            "<script src",
            "window.__initial_state__",
            "__next_data__",
            "__nuxt__",
            "__sveltekit_data",
            "window.__remixcontext",
            "window.__astro",
            "gatsby-focus-wrapper",
        ];
        if spa_indicators.iter().any(|ind| lower.contains(*ind)) {
            return true;
        }
    }

    // The page itself asks the visitor to turn JavaScript on.
    if lower.contains("<noscript>") && lower.contains("enable javascript") {
        return true;
    }

    // Hosted site builders: checked with a slightly more generous text budget.
    if body_len < 500 {
        let builder_indicators = [
            "framerusercontent.com",
            "webflow.io",
            "wixsite.com",
            "squarespace.com/universal",
        ];
        if builder_indicators.iter().any(|ind| lower.contains(*ind)) {
            return true;
        }
    }

    if body_len < 1000 {
        // Little text but many scripts.
        let script_count = lower.matches("<script").count();
        if script_count >= 5 {
            return true;
        }
        let storybook_indicators = [
            "id=\"storybook-root\"",
            "id=\"storybook-docs\"",
            "__storybook",
            "?path=/docs/",
            "/iframe.html",
        ];
        if storybook_indicators.iter().any(|ind| lower.contains(*ind)) {
            return true;
        }
    }

    false
}
/// Reports whether a small page with little visible text reads like an
/// anti-bot / access-denied interstitial.
///
/// Pages larger than 80,000 bytes, or whose body contains more than 600
/// non-whitespace visible characters, are never flagged. Otherwise the
/// lowercased visible body text is searched for a fixed list of block phrases.
pub fn looks_like_generic_bot_wall(html: &str) -> bool {
    const MAX_HTML_BYTES: usize = 80_000;
    const MAX_VISIBLE_CHARS: usize = 600;
    const BLOCK_PHRASES: &[&str] = &[
        "performing security verification",
        "verify you are human",
        "checking your browser",
        "enable javascript and cookies",
        "security check",
        "access denied",
        "request blocked",
        "the request could not be satisfied",
        "generated by cloudfront",
        "configured to block access",
    ];

    if html.len() > MAX_HTML_BYTES {
        return false;
    }

    let lowered = html.to_lowercase();
    let visible = visible_text_from_stripped_html(&body_html_without_scripts_lower(&lowered));
    let visible_chars = visible.chars().filter(|c| !c.is_whitespace()).count();

    // Long pages that merely mention these phrases are treated as real content.
    visible_chars <= MAX_VISIBLE_CHARS
        && BLOCK_PHRASES
            .iter()
            .any(|phrase| visible.contains(*phrase))
}
/// Identifies block / challenge pages of known bot-protection vendors.
///
/// Pages larger than 200,000 bytes are not examined. Otherwise only the first
/// 15,000 bytes are scanned for vendor-specific markers. Checks run in a fixed
/// order and the first hit wins.
///
/// Returns the vendor label (`"cloudflare"`, `"akamai"`, `"perimeterx"`,
/// `"datadome"`, `"imperva"`, `"sucuri"`, `"kasada"` or `"cloudfront"`), or
/// `None` when no marker is found.
pub fn looks_like_vendor_block(html: &str) -> Option<&'static str> {
    if html.len() > 200_000 {
        return None;
    }

    // Move the cut-off back to a char boundary: slicing a `str` in the middle
    // of a multi-byte code point panics. Index 0 is always a boundary.
    let mut head_len = html.len().min(15_000);
    while !html.is_char_boundary(head_len) {
        head_len -= 1;
    }
    let head = &html[..head_len];
    let lower_head = head.to_lowercase();

    if (lower_head.contains("challenge-form") && lower_head.contains("__cf_chl_f_tk="))
        || lower_head.contains("cf-error-code")
        || lower_head.contains("/cdn-cgi/challenge-platform/")
    {
        return Some("cloudflare");
    }
    // The reference-id regex is case-sensitive, so it runs on the original
    // (not lowercased) text.
    if lower_head.contains("pardon our interruption") || AKAMAI_REF_RE.is_match(head) {
        return Some("akamai");
    }
    if lower_head.contains("window._pxappid =") || lower_head.contains("captcha.px-cdn.net") {
        return Some("perimeterx");
    }
    if lower_head.contains("captcha-delivery.com") {
        return Some("datadome");
    }
    if lower_head.contains("_incapsula_resource") || lower_head.contains("incapsula incident id") {
        return Some("imperva");
    }
    if lower_head.contains("sucuri website firewall") {
        return Some("sucuri");
    }
    if lower_head.contains("kpsdk.scriptstart = kpsdk.now()") {
        return Some("kasada");
    }
    if lower_head.contains("generated by cloudfront")
        || lower_head.contains("the request could not be satisfied")
    {
        return Some("cloudfront");
    }
    None
}
/// Reports whether the page body carries fewer than 200 non-whitespace visible
/// characters, judged on the first 500,000 bytes of `html`.
///
/// The count comes from `extract_body_text_len`, which is given the lowercased
/// prefix.
pub fn looks_like_thin_html(html: &str) -> bool {
    // Move the cut-off back to a char boundary: slicing a `str` in the middle
    // of a multi-byte code point panics. Index 0 is always a boundary.
    let mut check_len = html.len().min(500_000);
    while !html.is_char_boundary(check_len) {
        check_len -= 1;
    }
    let lower = html[..check_len].to_lowercase();
    extract_body_text_len(&lower) < 200
}
/// Returns `true` when `markdown_len` is strictly below the 100-unit
/// "thin output" threshold.
pub fn is_thin_markdown(markdown_len: usize) -> bool {
    const THIN_MARKDOWN_THRESHOLD: usize = 100;
    markdown_len < THIN_MARKDOWN_THRESHOLD
}
/// Why a rendered page is considered a failed client-side render.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailedRenderReason {
    /// The markup contains a Next.js error marker.
    NextJsClientError,
    /// The markup links to the React error decoder.
    ReactMinifiedError,
    /// The `id="__next"` element has no content.
    EmptyNextRoot,
}

impl FailedRenderReason {
    /// Stable snake_case label for this reason.
    pub fn as_str(self) -> &'static str {
        match self {
            Self::NextJsClientError => "nextjs_client_error",
            Self::ReactMinifiedError => "react_minified_error",
            Self::EmptyNextRoot => "empty_next_root",
        }
    }
}
/// Detects pages whose client-side render visibly failed.
///
/// Pages larger than 200,000 bytes are not scanned. On the lowercased markup
/// the checks run in this order:
///
/// 1. Next.js error markers → `NextJsClientError`
/// 2. a link to the React error decoder → `ReactMinifiedError`
/// 3. an `id="__next"` element whose content up to the first `</div>` is
///    blank → `EmptyNextRoot`
///
/// Returns `None` when nothing matches.
pub fn looks_like_failed_render(html: &str) -> Option<FailedRenderReason> {
    if html.len() > 200_000 {
        return None;
    }

    let lower = html.to_lowercase();
    let has_any = |needles: &[&str]| needles.iter().any(|needle| lower.contains(*needle));

    if has_any(&[
        "id=\"__next-error-",
        "data-nextjs-error",
        "id=\"__next_error__\"",
    ]) {
        return Some(FailedRenderReason::NextJsClientError);
    }

    if has_any(&[
        "https://react.dev/errors/",
        "https://reactjs.org/docs/error-decoder",
    ]) {
        return Some(FailedRenderReason::ReactMinifiedError);
    }

    // Walk from the root's `id` attribute to the end of its opening tag, then
    // look at what sits between that and the next `</div>`.
    let root_is_empty = lower
        .find("id=\"__next\"")
        .map(|start| &lower[start..])
        .and_then(|after_id| after_id.find('>').map(|close| &after_id[close + 1..]))
        .and_then(|tail| tail.find("</div>").map(|end| tail[..end].trim().is_empty()))
        .unwrap_or(false);

    if root_is_empty {
        Some(FailedRenderReason::EmptyNextRoot)
    } else {
        None
    }
}
/// Reports whether a small page shows only a loading state.
///
/// Pages larger than 80,000 bytes are never flagged. With `n` the number of
/// non-whitespace visible characters in the body (scripts and styles removed):
///
/// * `n == 0` → placeholder;
/// * `n < 400` and the visible text contains a loading phrase → placeholder;
/// * `n < 200` and the body markup contains a spinner/loader attribute → placeholder.
pub fn looks_like_loading_placeholder(html: &str) -> bool {
    const LOADING_PHRASES: &[&str] = &[
        "loading...",
        "loading…",
        "please wait",
        "just a moment",
        "initializing",
        "preparing",
        "one moment",
    ];
    const SPINNER_ATTRS: &[&str] = &[
        "class=\"spinner",
        "class=\"loader",
        "class=\"loading",
        "class=\"preloader",
        "id=\"loader",
        "id=\"preloader",
        "aria-label=\"loading\"",
    ];

    if html.len() > 80_000 {
        return false;
    }

    let lowered = html.to_lowercase();
    let markup = body_html_without_scripts_lower(&lowered);
    let text = visible_text_from_stripped_html(&markup);
    let visible_chars = text.chars().filter(|c| !c.is_whitespace()).count();

    match visible_chars {
        0 => true,
        n => {
            // Phrases are matched against visible text only, so attribute
            // values such as `alt="Loading..."` do not count.
            let says_loading =
                n < 400 && LOADING_PHRASES.iter().any(|phrase| text.contains(*phrase));
            // Spinner attributes are matched against the script-free markup.
            let has_spinner =
                n < 200 && SPINNER_ATTRS.iter().any(|attr| markup.contains(*attr));
            says_loading || has_spinner
        }
    }
}
/// Extracts the inner markup of `<body>` from already-lowercased HTML and
/// removes every `<script>` and `<style>` block from it.
///
/// The body starts right after the `>` that closes the first `<body` tag and
/// ends at the last `</body>`. When there is no `</body>` at all, the body
/// runs to the end of the input: HTML allows the end tag to be omitted, and
/// callers that truncate large documents cut it off. Treating that case as
/// "no body" would make such pages look empty.
///
/// Returns an empty string when no `<body ...>` opening tag is found or when
/// the last `</body>` does not come after it.
fn body_html_without_scripts_lower(lower: &str) -> String {
    let body_start = lower
        .find("<body")
        .and_then(|i| lower[i..].find('>').map(|j| i + j + 1));
    let Some(start) = body_start else {
        return String::new();
    };

    let end = match lower.rfind("</body>") {
        Some(end) if start < end => end,
        // Closing tag precedes (or touches) the opening tag: nothing to extract.
        Some(_) => return String::new(),
        // Omitted or truncated end tag: take everything that is left.
        None => lower.len(),
    };

    let body = &lower[start..end];
    let stripped = strip_tag_blocks(body, "script");
    strip_tag_blocks(&stripped, "style")
}
/// Reduces markup to its text content.
///
/// Everything between `<` and `>` is dropped, and each run of whitespace in
/// the remaining text becomes a single space. Leading whitespace is removed;
/// tags themselves contribute no separator, so `a</p><p>b` yields `ab`.
fn visible_text_from_stripped_html(stripped: &str) -> String {
    let mut out = String::with_capacity(stripped.len());
    let mut inside_tag = false;
    // Starts `true` so whitespace at the very beginning is swallowed.
    let mut last_was_space = true;

    for c in stripped.chars() {
        match c {
            '<' => inside_tag = true,
            '>' => inside_tag = false,
            _ if inside_tag => {}
            _ if c.is_whitespace() => {
                if !last_was_space {
                    out.push(' ');
                    last_was_space = true;
                }
            }
            _ => {
                out.push(c);
                last_was_space = false;
            }
        }
    }

    out
}
/// Counts the non-whitespace visible characters inside `<body>` of
/// already-lowercased HTML, ignoring script and style blocks.
///
/// When the input has no `<body` tag at all, the fixed value 1000 is returned.
/// That value is not below any of the thresholds (200, 500, 1000) the callers
/// in this file compare against.
fn extract_body_text_len(lower: &str) -> usize {
    const NO_BODY_LEN: usize = 1000;

    match lower.find("<body") {
        None => NO_BODY_LEN,
        Some(_) => {
            let markup = body_html_without_scripts_lower(lower);
            let text = visible_text_from_stripped_html(&markup);
            text.chars().filter(|c| !c.is_whitespace()).count()
        }
    }
}
/// Removes every `<tag ...> ... </tag>` block from `html`.
///
/// A block starts at the text `<tag` and ends with the first following
/// `</tag>`. If an opening has no matching close, everything from that opening
/// to the end of the input is discarded. Matching is case-sensitive.
fn strip_tag_blocks(html: &str, tag: &str) -> String {
    let opener = format!("<{tag}");
    let closer = format!("</{tag}>");
    let mut kept = String::with_capacity(html.len());
    let mut rest = html;

    loop {
        let Some(open_at) = rest.find(&opener) else {
            // No further blocks: keep the remainder as is.
            kept.push_str(rest);
            return kept;
        };

        let (before, block) = rest.split_at(open_at);
        kept.push_str(before);

        match block.find(&closer) {
            Some(close_at) => rest = &block[close_at + closer.len()..],
            // Unterminated block: drop the rest of the input.
            None => return kept,
        }
    }
}
/// Detects a Cloudflare challenge / interstitial page.
///
/// Pages larger than 80,000 bytes are never flagged. On the lowercased markup:
///
/// * any single *strong* marker (challenge element ids, the challenge-platform
///   script path, challenge option/token variable names) is sufficient;
/// * otherwise at least two *distinct weak* signals (generic interstitial
///   phrases) are required, so a page that merely mentions one of them is not
///   flagged.
pub fn looks_like_cloudflare_challenge(html: &str) -> bool {
    if html.len() > 80_000 {
        return false;
    }
    let lower = html.to_lowercase();

    // `_cf_chl_opt` also matches `window._cf_chl_opt`, so the longer form does
    // not need its own entry.
    let strong = [
        "cf-browser-verification",
        "cf-challenge-running",
        "/cdn-cgi/challenge-platform/",
        "_cf_chl_opt",
        "__cf_chl_managed_tk__",
    ];
    if strong.iter().any(|m| lower.contains(*m)) {
        return true;
    }

    // Each inner slice is ONE signal; its entries are alternative spellings.
    // The footer phrase is matched both with a literal `&` and with the
    // `&amp;` entity used in raw HTML. Counting the spellings separately would
    // let that single phrase satisfy the two-signal rule by itself.
    let weak: [&[&str]; 4] = [
        &["just a moment"],
        &["checking your browser"],
        &["attention required"],
        &[
            "performance & security by cloudflare",
            "performance &amp; security by cloudflare",
        ],
    ];
    let weak_hits = weak
        .iter()
        .filter(|spellings| spellings.iter().any(|m| lower.contains(*m)))
        .count();
    weak_hits >= 2
}
/// Returns `true` when the header value, after trimming surrounding
/// whitespace, equals `challenge` or `block` (ASCII case-insensitive).
pub fn is_cloudflare_mitigated_header(header_value: &str) -> bool {
    let value = header_value.trim();
    ["challenge", "block"]
        .iter()
        .any(|expected| value.eq_ignore_ascii_case(expected))
}
// Unit tests for the detection heuristics in this file. Fixtures are inline
// HTML snippets; multi-line raw strings are part of the fixture and must keep
// their exact content.
#[cfg(test)]
mod tests {
use super::*;
// ---- needs_js_rendering ----
// Empty mount point plus an external script, with no visible text.
#[test]
fn detects_spa_shell() {
let html = r#"<html><head></head><body><div id="root"></div><script src="/app.js"></script></body></html>"#;
assert!(needs_js_rendering(html));
}
// Plain article markup without any SPA, builder or script markers.
#[test]
fn static_page_no_js_needed() {
let html = r#"<html><body><article><h1>Hello World</h1><p>This is a long article with plenty of text content to read and enjoy. It has multiple paragraphs and lots of useful information.</p></article></body></html>"#;
assert!(!needs_js_rendering(html));
}
// ---- looks_like_loading_placeholder ----
// Short body whose visible text contains the "loading..." phrase.
#[test]
fn detects_loading_placeholder_text() {
let html =
r#"<html><body><div><p>Loading...</p><p>Hi! Ask me anything.</p></div></body></html>"#;
assert!(looks_like_loading_placeholder(html));
}
#[test]
fn detects_spinner_only_body() {
let html = r#"<html><body><div class="spinner"></div></body></html>"#;
assert!(looks_like_loading_placeholder(html));
}
#[test]
fn real_content_not_placeholder() {
let html = r#"<html><body><article><h1>Welcome to my creative space</h1><p>Waqar Bin Abrar is a full stack developer specializing in MERN stack and Flutter apps, building scalable digital solutions for clients worldwide.</p><p>With years of experience delivering production applications, he combines technical expertise with design sensibility.</p></article></body></html>"#;
assert!(!looks_like_loading_placeholder(html));
}
// "Loading..." appears only inside an `alt` attribute, i.e. inside a tag,
// so it is not part of the visible text that the phrase check looks at.
#[test]
fn logo_alt_loading_on_real_page_not_placeholder() {
let html = r#"<html><body>
<header><img alt="Loading..." src="/logo.png"/></header>
<article>
<h1>Software Engineering Blog</h1>
<p>Thoughts on distributed systems, programming languages, and the craft of writing software that lasts. New posts weekly.</p>
<p>This site covers topics from Rust ownership to Kubernetes operators.</p>
</article>
</body></html>"#;
assert!(!looks_like_loading_placeholder(html));
}
#[test]
fn empty_body_is_placeholder() {
let html = r#"<html><body></body></html>"#;
assert!(looks_like_loading_placeholder(html));
}
// The page exceeds the 80,000-byte limit, so it is rejected before any
// phrase matching happens.
#[test]
fn large_page_never_placeholder() {
let filler = "x".repeat(100_000);
let html = format!("<html><body><p>Loading...</p>{filler}</body></html>");
assert!(!looks_like_loading_placeholder(&html));
}
// ---- looks_like_failed_render ----
#[test]
fn detects_nextjs_app_router_error_boundary() {
let html = r#"<html><body><div id="__next-error-0"><h2>Application error: a client-side exception has occurred.</h2></div></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::NextJsClientError)
);
}
#[test]
fn detects_nextjs_pages_router_error() {
let html = r#"<html><body><div id="__next_error__">oops</div></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::NextJsClientError)
);
}
#[test]
fn detects_react_minified_error() {
let html = r#"<html><body><a href="https://react.dev/errors/418">Minified React error #418</a></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::ReactMinifiedError)
);
}
#[test]
fn detects_legacy_react_error_decoder_url() {
let html = r#"<html><body><a href="https://reactjs.org/docs/error-decoder.html?invariant=31">React</a></body></html>"#;
assert_eq!(
looks_like_failed_render(html),
Some(FailedRenderReason::ReactMinifiedError)
);
}
// Quoting the error message in prose must not trigger: detection relies on
// element ids / attributes and decoder URLs, not on the message text.
#[test]
fn blog_post_about_error_is_not_failed_render() {
let html = r#"<html><body><article><h1>Debugging Next.js</h1>
<p>When you see "Application error: a client-side exception has occurred",
it usually means a hydration mismatch.</p>
<pre><code>console.log('debug')</code></pre>
</article></body></html>"#;
assert!(looks_like_failed_render(html).is_none());
}
#[test]
fn healthy_page_is_not_failed_render() {
let html =
r#"<html><body><main><h1>Hello</h1><p>Real content here.</p></main></body></html>"#;
assert!(looks_like_failed_render(html).is_none());
}
// Contains an error marker but is larger than the 200,000-byte scan limit.
#[test]
fn huge_page_is_not_scanned() {
let mut html = String::from(r#"<html><body><div id="__next-error-0"></div>"#);
html.push_str(&"<p>filler</p>".repeat(20_000));
html.push_str("</body></html>");
assert!(html.len() > 200_000);
assert!(looks_like_failed_render(&html).is_none());
}
// ---- looks_like_cloudflare_challenge ----
#[test]
fn cf_strong_marker_detected() {
let html =
r#"<html><body><div id="cf-browser-verification">Just a moment...</div></body></html>"#;
assert!(looks_like_cloudflare_challenge(html));
}
#[test]
fn cf_managed_token_detected() {
let html = r#"<html><body><script>window._cf_chl_opt={cvId:'2'};</script></body></html>"#;
assert!(looks_like_cloudflare_challenge(html));
}
#[test]
fn cf_single_weak_marker_not_enough() {
let html = r#"<html><body><article><h1>Why we use Cloudflare</h1><p>Performance benefits.</p></article></body></html>"#;
assert!(!looks_like_cloudflare_challenge(html));
}
#[test]
fn cf_two_weak_markers_trigger() {
let html =
r#"<html><body><h1>Just a moment...</h1><p>Checking your browser...</p></body></html>"#;
assert!(looks_like_cloudflare_challenge(html));
}
#[test]
fn cf_ray_id_alone_does_not_trigger() {
let html = r#"<html><body><h1>About</h1><p>Hosted via Cloudflare.</p><footer>Ray ID: abc123</footer></body></html>"#;
assert!(!looks_like_cloudflare_challenge(html));
}
// ---- is_cloudflare_mitigated_header ----
// Matching ignores surrounding whitespace and ASCII case.
#[test]
fn cf_mitigated_header_challenge() {
assert!(is_cloudflare_mitigated_header("challenge"));
assert!(is_cloudflare_mitigated_header(" CHALLENGE "));
assert!(is_cloudflare_mitigated_header("block"));
}
#[test]
fn cf_mitigated_header_other_values() {
assert!(!is_cloudflare_mitigated_header(""));
assert!(!is_cloudflare_mitigated_header("ok"));
assert!(!is_cloudflare_mitigated_header("verified"));
}
// Strong marker present, but the page is above the 80,000-byte limit of
// looks_like_cloudflare_challenge.
#[test]
fn cf_huge_page_not_scanned() {
let mut html = String::from(r#"<html><body><div id="cf-browser-verification">"#);
html.push_str(&"<p>x</p>".repeat(20_000));
html.push_str("</div></body></html>");
assert!(html.len() > 80_000);
assert!(!looks_like_cloudflare_challenge(&html));
}
// ---- looks_like_generic_bot_wall ----
#[test]
fn cloudfront_403_block_page_is_bot_wall() {
let html = r#"<html><head><title>ERROR: The request could not be satisfied</title></head>
<body><h1>403 ERROR</h1>
<h3>The request could not be satisfied.</h3>
<p>The Amazon CloudFront distribution is configured to block access from your country.</p>
<hr><i>Generated by cloudfront (CloudFront)</i></body></html>"#;
assert!(looks_like_generic_bot_wall(html));
}
#[test]
fn generic_403_with_block_phrasing_is_bot_wall() {
let html = r#"<html><body><h1>403</h1>
<p>Our firewall is configured to block access from this region.</p></body></html>"#;
assert!(looks_like_generic_bot_wall(html));
}
// Same phrase repeated in a long article: the visible-text budget of
// looks_like_generic_bot_wall is exceeded, so the page is not flagged.
#[test]
fn legitimate_blog_about_cloudfront_is_not_bot_wall() {
let mut html = String::from(r#"<html><body><article>"#);
html.push_str("<p>An article about CloudFront and how distributions are configured to block access by country. </p>".repeat(20).as_str());
html.push_str("</article></body></html>");
assert!(!looks_like_generic_bot_wall(&html));
}
// ---- looks_like_vendor_block ----
#[test]
fn vendor_cloudflare_challenge_form_detected() {
let html = r#"<html><body><form class="challenge-form" action="/?__cf_chl_f_tk=abc123">
</form></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("cloudflare"));
}
#[test]
fn vendor_cloudflare_error_code_detected() {
let html = r#"<html><body><span class="cf-error-code">1020</span></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("cloudflare"));
}
#[test]
fn vendor_cloudflare_challenge_platform_detected() {
let html = r#"<html><head><script src="/cdn-cgi/challenge-platform/h/g/orchestrate/chl_page/v1?ray=abc"></script></head></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("cloudflare"));
}
#[test]
fn vendor_akamai_reference_id_detected() {
let html = r#"<html><body><p>Access Denied</p>
<p>Reference #18.2d351ab8.1557333295.a4e16ab</p></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("akamai"));
}
#[test]
fn vendor_akamai_pardon_our_interruption_detected() {
let html = r#"<html><body><h1>Pardon Our Interruption</h1>
<p>As you were browsing, something about your browser made us think you were a bot.</p>
</body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("akamai"));
}
// The marker is written in mixed case here; detection runs on lowercased text.
#[test]
fn vendor_perimeterx_pxappid_detected() {
let html = r#"<html><head><script>window._pxAppId = 'PXabc123';</script></head></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("perimeterx"));
}
#[test]
fn vendor_datadome_captcha_domain_detected() {
let html = r#"<html><body><iframe src="https://geo.captcha-delivery.com/captcha/?initialCid=xyz"></iframe></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("datadome"));
}
#[test]
fn vendor_imperva_incapsula_resource_detected() {
let html = r#"<html><body><script src="/_Incapsula_Resource?SWJIYLWA=blah"></script></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("imperva"));
}
#[test]
fn vendor_sucuri_firewall_brand_detected() {
let html = r#"<html><body><h1>Sucuri WebSite Firewall - Access Denied</h1></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("sucuri"));
}
#[test]
fn vendor_cloudfront_block_detected() {
let html = r#"<html><head><title>ERROR: The request could not be satisfied</title></head>
<body><h1>403 ERROR</h1>
<hr><i>Generated by cloudfront (CloudFront)</i></body></html>"#;
assert_eq!(looks_like_vendor_block(html), Some("cloudfront"));
}
// A long page (more than 15,000 bytes) that names the vendor in prose but
// contains none of the technical markers.
#[test]
fn vendor_legit_blog_about_cloudflare_is_none() {
let mut html = String::from("<html><body><article><h1>Why we picked Cloudflare</h1>");
html.push_str(
&"<p>Cloudflare gives us DDoS protection and a global anycast network.</p>".repeat(400),
);
html.push_str("</article></body></html>");
assert!(html.len() > 15_000);
assert!(looks_like_vendor_block(&html).is_none());
}
// Above the 200,000-byte limit of looks_like_vendor_block.
#[test]
fn vendor_block_oversized_page_returns_none() {
let big = "x".repeat(300_000);
assert!(looks_like_vendor_block(&big).is_none());
}
#[test]
fn vendor_block_clean_page_returns_none() {
let html = r#"<html><body><main><h1>Hello</h1><p>Real content.</p></main></body></html>"#;
assert!(looks_like_vendor_block(html).is_none());
}
// ---- looks_like_loading_placeholder (script stripping) ----
// The spinner attribute only occurs inside a <script> block, which is removed
// before spinner markers are searched.
#[test]
fn spinner_class_in_script_body_ignored() {
let html = r#"<html><body><article><h1>Real Article</h1><p>This is a real article with substantial content about the topic at hand, providing useful information.</p><script>const x = 'class="spinner"';</script></article></body></html>"#;
assert!(!looks_like_loading_placeholder(html));
}
}