use super::quality::QualityScore;
/// Reason a response was judged to need authentication.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AuthRequiredKind {
/// Login-wall and/or password-gate markers were found in the body.
LoginRequired,
/// The status was 419/440 or the body carried session-expired markers.
SessionExpired,
}
/// Which browser/bot challenge the classifier recognised.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BrowserChallengeKind {
/// Cloudflare challenge markers on an HTTP 403 or 503 response.
Cloudflare,
/// Vercel checkpoint markers on an HTTP 429 response.
Vercel,
/// Turnstile markers in the body; the classifier does not look at the status for this one.
Turnstile,
/// CAPTCHA widget plus interstitial wording on an HTTP 403, 429 or 503 response.
Captcha,
/// HTTP status 999, which the classifier reports as LinkedIn bot detection.
LinkedInBotDetection,
}
/// Top-level category of a [`ResponseDiagnostic`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseDiagnosticKind {
/// The content sits behind a login or an expired session.
AuthRequired(AuthRequiredKind),
/// The response is a browser/bot challenge page.
BrowserChallenge(BrowserChallengeKind),
/// HTTP 429 with rate-limit wording in the body.
RateLimited,
}
/// A diagnostic describing why a response is not the content the caller wanted.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ResponseDiagnostic {
    /// What was detected.
    pub kind: ResponseDiagnosticKind,
    /// HTTP status code of the response the diagnostic was derived from.
    pub status: u16,
}

impl ResponseDiagnostic {
    /// Returns a stable, machine-readable identifier for the diagnostic kind.
    #[must_use]
    pub fn code(self) -> &'static str {
        match self.kind {
            ResponseDiagnosticKind::AuthRequired(auth) => match auth {
                AuthRequiredKind::LoginRequired => "login_required",
                AuthRequiredKind::SessionExpired => "session_expired",
            },
            ResponseDiagnosticKind::BrowserChallenge(challenge) => match challenge {
                BrowserChallengeKind::Cloudflare => "cloudflare_challenge",
                BrowserChallengeKind::Vercel => "vercel_challenge",
                BrowserChallengeKind::Turnstile => "turnstile_challenge",
                BrowserChallengeKind::Captcha => "captcha_challenge",
                BrowserChallengeKind::LinkedInBotDetection => "linkedin_bot_detection",
            },
            ResponseDiagnosticKind::RateLimited => "rate_limited",
        }
    }

    /// Returns a one-sentence, human-readable description of what was detected.
    ///
    /// Most variants include the HTTP status. The Vercel and Turnstile
    /// summaries omit it, and the LinkedIn summary always prints `999`.
    #[must_use]
    pub fn summary(self) -> String {
        let status = self.status;
        match self.kind {
            ResponseDiagnosticKind::AuthRequired(auth) => match auth {
                AuthRequiredKind::LoginRequired => format!(
                    "Login wall or authenticated content detected (HTTP {}).",
                    status
                ),
                AuthRequiredKind::SessionExpired => {
                    format!("Session appears expired or timed out (HTTP {}).", status)
                }
            },
            ResponseDiagnosticKind::BrowserChallenge(challenge) => match challenge {
                BrowserChallengeKind::Cloudflare => {
                    format!("Cloudflare browser challenge detected (HTTP {}).", status)
                }
                BrowserChallengeKind::Vercel => {
                    String::from("Vercel Security Checkpoint detected.")
                }
                BrowserChallengeKind::Turnstile => {
                    String::from("Cloudflare Turnstile challenge detected.")
                }
                BrowserChallengeKind::Captcha => {
                    format!("CAPTCHA challenge detected (HTTP {}).", status)
                }
                BrowserChallengeKind::LinkedInBotDetection => {
                    String::from("LinkedIn bot detection (HTTP 999).")
                }
            },
            ResponseDiagnosticKind::RateLimited => format!(
                "Rate limit or throttling response detected (HTTP {}).",
                status
            ),
        }
    }

    /// Returns the remediation advice for this diagnostic.
    ///
    /// Every browser-challenge variant shares the same advice.
    #[must_use]
    pub fn guidance(self) -> &'static str {
        match self.kind {
            ResponseDiagnosticKind::AuthRequired(auth) => match auth {
                AuthRequiredKind::LoginRequired => {
                    "Sign in in a browser first, then retry with the default browser cookies or a named authenticated session. If you explicitly disabled cookies, re-enable them."
                }
                AuthRequiredKind::SessionExpired => {
                    "Refresh the site in a browser to renew the session, then retry with the default browser cookies or a named authenticated session."
                }
            },
            ResponseDiagnosticKind::BrowserChallenge(_) => {
                "Complete the browser challenge in a real browser first, then retry with the default browser cookies or a named session. Use an explicit browser override only if the default profile is not the authenticated one."
            }
            ResponseDiagnosticKind::RateLimited => {
                "Retry later, or use an authenticated browser/session path if the site rate-limits anonymous traffic."
            }
        }
    }

    /// Returns the summary and the guidance joined by a single newline.
    #[must_use]
    pub fn message(self) -> String {
        let mut text = self.summary();
        text.push('\n');
        text.push_str(self.guidance());
        text
    }
}
/// Coarse class assigned to a response by [`classify_response`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseClass {
    /// HTTP 401, or a session-expired diagnostic.
    Unauthorized,
    /// A login wall was detected.
    LoginRequired,
    /// Forbidden / access-denied markers were detected.
    Forbidden,
    /// A browser challenge or CAPTCHA was detected.
    BotChallenge,
    /// The response was rate-limited.
    RateLimited,
    /// The extracted content is dominated by an encoded blob.
    ObfuscatedContent,
    /// The extracted markdown is very small relative to the HTML.
    ThinContent,
}

impl ResponseClass {
    /// Returns the stable snake_case identifier for this class.
    #[must_use]
    pub fn code(self) -> &'static str {
        let label = match self {
            ResponseClass::Unauthorized => "unauthorized",
            ResponseClass::LoginRequired => "login_required",
            ResponseClass::Forbidden => "forbidden",
            ResponseClass::BotChallenge => "bot_challenge",
            ResponseClass::RateLimited => "rate_limited",
            ResponseClass::ObfuscatedContent => "obfuscated_content",
            ResponseClass::ThinContent => "thin_content",
        };
        label
    }
}
/// One classification finding: a class, how sure the classifier is, and why.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ResponseSignal {
/// The class this signal asserts.
pub class: ResponseClass,
/// Classifier-assigned confidence; the values produced in this file lie between 0.78 and 0.97.
pub confidence: f32,
/// Short static explanation of what triggered the signal.
pub reason: &'static str,
}
/// Ordered collection of signals, holding at most one signal per class.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct ResponseClassification {
    // Insertion order is preserved; `primary` reports the first entry.
    signals: Vec<ResponseSignal>,
}

impl ResponseClassification {
    /// Appends `signal` unless a signal of the same class is already present.
    fn push(&mut self, signal: ResponseSignal) {
        if self.has_class(signal.class) {
            return;
        }
        self.signals.push(signal);
    }

    /// Returns the first signal that was recorded, if any.
    #[must_use]
    pub fn primary(&self) -> Option<&ResponseSignal> {
        self.signals.first()
    }

    /// Reports whether any recorded signal has the given class.
    #[must_use]
    pub fn has_class(&self, class: ResponseClass) -> bool {
        self.signals
            .iter()
            .map(|existing| existing.class)
            .any(|seen| seen == class)
    }
}
/// Borrowed inputs for [`classify_response`].
#[derive(Debug, Clone, Copy)]
pub struct ResponseAnalysis<'a> {
/// HTTP status code of the response.
pub status: u16,
/// Response body; it is lowercased and scanned for marker strings.
pub body: &'a str,
/// Content type, if known; the content classifiers only act when it mentions "html".
pub content_type: Option<&'a str>,
/// HTML size used by the thin-content check (presumably in bytes — confirm with callers).
pub html_bytes: Option<usize>,
/// Extracted markdown, inspected by the obfuscated-content check.
pub markdown: Option<&'a str>,
/// Markdown size used by the thin-content check (presumably in characters — confirm with callers).
pub markdown_chars: Option<usize>,
/// Optional extraction quality score; only its `confidence` is read in this file.
pub quality: Option<&'a QualityScore>,
}
/// Classifies a response from its status code and raw body.
///
/// The body is lowercased once and handed to the marker-based classifier.
/// Returns `None` when nothing noteworthy is detected.
#[must_use]
pub fn classify_http_response(status: u16, body: &str) -> Option<ResponseDiagnostic> {
    classify_http_response_lower(status, &body.to_lowercase())
}
/// Builds the full classification for a response.
///
/// At most one access-related signal is recorded first (401, then a
/// marker-based diagnostic, then a forbidden fallback). Thin-content and
/// obfuscated-content signals follow, in that order.
#[must_use]
pub fn classify_response(analysis: ResponseAnalysis<'_>) -> ResponseClassification {
    let lowered = analysis.body.to_lowercase();
    let status = analysis.status;
    let mut result = ResponseClassification::default();

    // Access-related signal: the three sources are mutually exclusive.
    let access_signal = if status == 401 {
        Some(ResponseSignal {
            class: ResponseClass::Unauthorized,
            confidence: 0.97,
            reason: "http 401 unauthorized response",
        })
    } else if let Some(diagnostic) = classify_http_response_lower(status, &lowered) {
        Some(map_diagnostic_signal(diagnostic))
    } else if matches!(status, 403 | 999) && looks_like_forbidden(&lowered) {
        // NOTE(review): `classify_http_response_lower` returns a diagnostic for
        // every 999 response, so the 999 half of this arm is not reached today.
        let (confidence, reason) = if status == 999 {
            (0.96, "nonstandard anti-automation block status detected")
        } else {
            (0.85, "forbidden or access-denied markers detected")
        };
        Some(ResponseSignal {
            class: ResponseClass::Forbidden,
            confidence,
            reason,
        })
    } else {
        None
    };
    if let Some(signal) = access_signal {
        result.push(signal);
    }

    // Thin content needs both size measurements to be available.
    let is_thin = match (analysis.html_bytes, analysis.markdown_chars) {
        (Some(html_bytes), Some(markdown_chars)) => classify_thin_content(
            analysis.content_type,
            html_bytes,
            markdown_chars,
            analysis.quality,
        )
        .is_some(),
        _ => false,
    };
    if is_thin {
        let confidence = match analysis.quality {
            None => 0.78_f32,
            Some(quality) if quality.confidence < 0.5 => 0.9_f32,
            Some(_) => 0.8_f32,
        };
        result.push(ResponseSignal {
            class: ResponseClass::ThinContent,
            confidence,
            reason: "markdown output is disproportionately small relative to the HTML body",
        });
    }

    let is_obfuscated = analysis.markdown.is_some_and(|markdown| {
        classify_obfuscated_content(analysis.content_type, markdown).is_some()
    });
    if is_obfuscated {
        result.push(ResponseSignal {
            class: ResponseClass::ObfuscatedContent,
            confidence: 0.95,
            reason: "extracted content is dominated by a long encoded or obfuscated blob",
        });
    }

    result
}
/// Details returned by [`classify_thin_content`] when a page is judged thin.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ThinContentDiagnostic {
/// The `html_bytes` value the classifier was called with.
pub html_bytes: usize,
/// The `markdown_chars` value the classifier was called with.
pub markdown_chars: usize,
/// Whether a quality score with low confidence accompanied the finding.
pub low_confidence: bool,
}
/// Decides whether extracted markdown is suspiciously small for its HTML source.
///
/// Returns `None` unless `content_type` mentions "html". The media type is
/// matched case-insensitively, so values such as `Text/HTML` are accepted.
///
/// A diagnostic is produced in two situations:
/// * `is_thin_content` flags the size ratio; `low_confidence` then reflects
///   whether a quality score below 0.5 was supplied.
/// * The HTML is at least 5 000 and the markdown below 800 in size, and the
///   quality score's confidence is below 0.35; `low_confidence` is then `true`.
#[must_use]
pub fn classify_thin_content(
    content_type: Option<&str>,
    html_bytes: usize,
    markdown_chars: usize,
    quality: Option<&QualityScore>,
) -> Option<ThinContentDiagnostic> {
    // Media types are case-insensitive (RFC 9110 §8.3.1), so normalise first.
    let is_html = content_type.is_some_and(|value| value.to_ascii_lowercase().contains("html"));
    if !is_html {
        return None;
    }

    if is_thin_content(html_bytes, markdown_chars) {
        return Some(ThinContentDiagnostic {
            html_bytes,
            markdown_chars,
            low_confidence: quality.is_some_and(|score| score.confidence < 0.5),
        });
    }

    // Fallback: the ratio alone was not conclusive, but the extractor itself
    // reported very low confidence on a large page with little output.
    if html_bytes >= 5_000
        && markdown_chars < 800
        && quality.is_some_and(|score| score.confidence < 0.35)
    {
        return Some(ThinContentDiagnostic {
            html_bytes,
            markdown_chars,
            low_confidence: true,
        });
    }

    None
}
/// Details returned by [`classify_obfuscated_content`] when a blob dominates the markdown.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ObfuscatedContentDiagnostic {
/// Length of the longest base64-like whitespace-delimited token.
pub dominant_blob_chars: usize,
/// Number of non-whitespace characters in the markdown.
pub non_whitespace_chars: usize,
/// Number of readable-looking words found; counting stops at 32.
pub readable_word_count: usize,
}
/// Detects markdown that is dominated by one long encoded-looking token.
///
/// Returns `None` unless `content_type` mentions "html". The media type is
/// matched case-insensitively, so values such as `Text/HTML` are accepted.
///
/// The markdown is then rejected (returns `None`) when it has fewer than
/// 2 048 non-whitespace characters, or when at least 24 readable words are
/// found. Otherwise a diagnostic is returned if the longest base64-like token
/// is at least 2 048 long, or at least 1 024 long while making up 60% or more
/// of the non-whitespace characters.
#[must_use]
pub fn classify_obfuscated_content(
    content_type: Option<&str>,
    markdown: &str,
) -> Option<ObfuscatedContentDiagnostic> {
    // Media types are case-insensitive (RFC 9110 §8.3.1), so normalise first.
    let is_html = content_type.is_some_and(|value| value.to_ascii_lowercase().contains("html"));
    if !is_html {
        return None;
    }

    let non_whitespace_chars = markdown.chars().filter(|c| !c.is_whitespace()).count();
    if non_whitespace_chars < 2_048 {
        return None;
    }

    // Counting stops at 32 readable words; only the 24-word threshold matters.
    let readable_word_count = markdown
        .split_whitespace()
        .filter(|token| looks_like_readable_word(token))
        .take(32)
        .count();
    if readable_word_count >= 24 {
        return None;
    }

    let dominant_blob_chars = markdown
        .split_whitespace()
        .filter_map(base64ish_blob_token_len)
        .max()
        .unwrap_or(0);
    // `max(1)` keeps the division safe even though the early return above
    // already guarantees a non-zero divisor.
    let dominant_ratio = (dominant_blob_chars * 100) / non_whitespace_chars.max(1);

    if dominant_blob_chars >= 2_048 || (dominant_blob_chars >= 1_024 && dominant_ratio >= 60) {
        return Some(ObfuscatedContentDiagnostic {
            dominant_blob_chars,
            non_whitespace_chars,
            readable_word_count,
        });
    }

    None
}
fn classify_http_response_lower(status: u16, body_lower: &str) -> Option<ResponseDiagnostic> {
if status == 999 {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::BrowserChallenge(
BrowserChallengeKind::LinkedInBotDetection,
),
status,
});
}
if looks_like_turnstile(body_lower) {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile),
status,
});
}
if status == 429 && looks_like_vercel_checkpoint(body_lower) {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel),
status,
});
}
if matches!(status, 403 | 503) && looks_like_cloudflare_challenge(body_lower) {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare),
status,
});
}
if matches!(status, 403 | 429 | 503) && looks_like_captcha_interstitial(body_lower) {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Captcha),
status,
});
}
if matches!(status, 419 | 440) || looks_like_session_expired(body_lower) {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired),
status,
});
}
if status == 429 && looks_like_rate_limit(body_lower) {
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::RateLimited,
status,
});
}
if (status == 403
&& (looks_like_login_wall(body_lower) || looks_like_password_gate(body_lower)))
|| (looks_like_login_wall(body_lower) && looks_like_password_gate(body_lower))
{
return Some(ResponseDiagnostic {
kind: ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired),
status,
});
}
None
}
/// Converts a diagnostic into the signal recorded by `classify_response`.
///
/// Session expiry maps to `Unauthorized`; every browser challenge maps to
/// `BotChallenge`. A login wall seen on HTTP 200 gets a lower confidence than
/// one seen on any other status.
fn map_diagnostic_signal(diagnostic: ResponseDiagnostic) -> ResponseSignal {
    let (class, confidence, reason) = match diagnostic.kind {
        ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired) => {
            let confidence = if diagnostic.status == 200 { 0.83 } else { 0.95 };
            (
                ResponseClass::LoginRequired,
                confidence,
                "login-wall markers and password-gate signals detected",
            )
        }
        ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired) => (
            ResponseClass::Unauthorized,
            0.94,
            "session-expired markers detected",
        ),
        ResponseDiagnosticKind::BrowserChallenge(_) => (
            ResponseClass::BotChallenge,
            0.97,
            "browser-challenge or CAPTCHA markers detected",
        ),
        ResponseDiagnosticKind::RateLimited => (
            ResponseClass::RateLimited,
            0.91,
            "rate-limit markers detected",
        ),
    };

    ResponseSignal {
        class,
        confidence,
        reason,
    }
}
/// Reports whether the lowercased body carries Vercel checkpoint wording.
fn looks_like_vercel_checkpoint(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "vercel security checkpoint",
        "we're verifying your browser",
        "we are verifying your browser",
    ];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body carries Cloudflare challenge markers.
fn looks_like_cloudflare_challenge(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "cf-browser-verification",
        "cf-chl-",
        "cf-challenge",
        "checking your browser before accessing",
        "just a moment...",
        "cloudflare ray id",
    ];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body references a Turnstile widget or script.
fn looks_like_turnstile(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &["cf-turnstile", "turnstile.js", "challenge-platform"];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body embeds a CAPTCHA.
///
/// Matches either a known widget marker, or the word "captcha" together with
/// an `<img` tag.
fn looks_like_captcha(body_lower: &str) -> bool {
    const WIDGET_MARKERS: &[&str] = &["g-recaptcha", "grecaptcha", "h-captcha", "hcaptcha"];
    if contains_any(body_lower, WIDGET_MARKERS) {
        return true;
    }
    body_lower.contains("captcha") && body_lower.contains("<img")
}
/// Reports whether the lowercased body is a CAPTCHA interstitial page.
///
/// Requires a CAPTCHA and human-verification wording, so a CAPTCHA embedded in
/// an ordinary form does not match on its own.
fn looks_like_captcha_interstitial(body_lower: &str) -> bool {
    const INTERSTITIAL_PHRASES: &[&str] = &[
        "verify you are human",
        "are you human",
        "security check",
        "browser verification",
        "checking your browser",
        "please enable javascript and cookies to continue",
    ];
    if !looks_like_captcha(body_lower) {
        return false;
    }
    contains_any(body_lower, INTERSTITIAL_PHRASES)
}
/// Reports whether the lowercased body carries rate-limit wording.
fn looks_like_rate_limit(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "too many requests",
        "rate limit",
        "rate-limit",
        "throttled",
        "request limit reached",
    ];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body carries access-denied wording.
fn looks_like_forbidden(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "access denied",
        "forbidden",
        "permission denied",
        "not authorized",
        "not authorised",
    ];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body carries session-expiry wording.
fn looks_like_session_expired(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "session expired",
        "your session has expired",
        "session timed out",
        "please sign in again",
        "please log in again",
    ];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body carries login-wall wording.
fn looks_like_login_wall(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "login required",
        "log in to continue",
        "sign in to continue",
        "authentication required",
        "please authenticate",
        "continue with google",
        "continue with email",
        "sign in with",
    ];
    contains_any(body_lower, MARKERS)
}
/// Reports whether the lowercased body contains a password field or prompt.
fn looks_like_password_gate(body_lower: &str) -> bool {
    const MARKERS: &[&str] = &[
        "type=\"password\"",
        "autocomplete=\"current-password\"",
        "name=\"password\"",
        "id=\"password\"",
        "enter your password",
        "forgot password",
    ];
    contains_any(body_lower, MARKERS)
}
/// Returns `true` when `haystack` contains at least one of `needles`.
///
/// An empty `needles` slice yields `false`.
fn contains_any(haystack: &str, needles: &[&str]) -> bool {
    needles
        .iter()
        .copied()
        .any(|marker| haystack.contains(marker))
}
/// Reports whether `token` looks like an ordinary word.
///
/// A token qualifies when it is 4 to 24 characters long and at least 80% of
/// its characters are ASCII letters (integer percentage, rounded down).
fn looks_like_readable_word(token: &str) -> bool {
    // Count characters and ASCII letters in a single pass.
    let (total, letters) = token.chars().fold((0_usize, 0_usize), |(total, letters), ch| {
        (total + 1, letters + usize::from(ch.is_ascii_alphabetic()))
    });
    // The length check short-circuits first, so `total` is never zero below.
    (4..=24).contains(&total) && letters * 100 / total >= 80
}
/// Returns the byte length of `token` when it looks like an encoded blob.
///
/// A token qualifies when it is at least 768 bytes long, at least 98% of its
/// bytes are ASCII alphanumerics or one of `+ / = _ -`, and it contains at
/// least one ASCII digit and one ASCII letter.
fn base64ish_blob_token_len(token: &str) -> Option<usize> {
    let total = token.len();
    if total < 768 {
        return None;
    }

    // Tally all three byte categories in a single pass.
    let mut permitted = 0_usize;
    let mut digits = 0_usize;
    let mut letters = 0_usize;
    for byte in token.bytes() {
        if byte.is_ascii_digit() {
            digits += 1;
            permitted += 1;
        } else if byte.is_ascii_alphabetic() {
            letters += 1;
            permitted += 1;
        } else if matches!(byte, b'+' | b'/' | b'=' | b'_' | b'-') {
            permitted += 1;
        }
    }

    let mostly_permitted = permitted * 100 / total >= 98;
    (mostly_permitted && digits > 0 && letters > 0).then_some(total)
}
/// Reports whether the markdown is disproportionately small for the HTML.
///
/// True only when the HTML length is at least 5 000, the markdown length is
/// below 800, and the markdown is less than 2% of the HTML length (integer
/// percentage, rounded down).
fn is_thin_content(html_len: usize, markdown_len: usize) -> bool {
    const MIN_HTML_LEN: usize = 5_000;
    const MIN_MARKDOWN_LEN: usize = 800;
    const THIN_RATIO_PERCENT: usize = 2;

    let large_page = html_len >= MIN_HTML_LEN;
    let short_markdown = markdown_len < MIN_MARKDOWN_LEN;
    large_page
        && short_markdown
        && (markdown_len * 100) / html_len.max(1) < THIN_RATIO_PERCENT
}
#[cfg(test)]
mod tests {
use super::{
AuthRequiredKind, BrowserChallengeKind, ResponseAnalysis, ResponseClass,
ResponseDiagnosticKind, classify_http_response, classify_obfuscated_content,
classify_response,
};
#[test]
fn classify_http_response_detects_vercel_checkpoint() {
let body = "<html><body>Vercel Security Checkpoint</body></html>";
let diagnostic = classify_http_response(429, body).expect("vercel classification");
assert_eq!(
diagnostic.kind,
ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Vercel)
);
assert_eq!(diagnostic.code(), "vercel_challenge");
}
#[test]
fn classify_http_response_detects_cloudflare_challenge() {
let body = "<div id='cf-browser-verification'>Please wait...</div>";
let diagnostic = classify_http_response(403, body).expect("cloudflare classification");
assert_eq!(
diagnostic.kind,
ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Cloudflare)
);
}
#[test]
fn classify_http_response_detects_turnstile_challenge_on_200() {
let body = "<div class='cf-turnstile'></div><script src='turnstile.js'></script>";
let diagnostic = classify_http_response(200, body).expect("turnstile classification");
assert_eq!(
diagnostic.kind,
ResponseDiagnosticKind::BrowserChallenge(BrowserChallengeKind::Turnstile)
);
}
#[test]
fn classify_http_response_detects_login_wall_with_password_form() {
let body = r#"
<html><body>
<h1>Sign in to continue</h1>
<form><input type="password" name="password"></form>
</body></html>
"#;
let diagnostic = classify_http_response(200, body).expect("login wall classification");
assert_eq!(
diagnostic.kind,
ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired)
);
assert_eq!(diagnostic.code(), "login_required");
}
#[test]
fn classify_http_response_detects_session_expired() {
let body = "<html><body>Your session has expired. Please sign in again.</body></html>";
let diagnostic = classify_http_response(200, body).expect("session expired classification");
assert_eq!(
diagnostic.kind,
ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::SessionExpired)
);
}
#[test]
fn classify_http_response_detects_rate_limit() {
let body = "Too many requests. Rate limit reached.";
let diagnostic = classify_http_response(429, body).expect("rate limit classification");
assert_eq!(diagnostic.kind, ResponseDiagnosticKind::RateLimited);
}
#[test]
fn classify_http_response_ignores_normal_html() {
let body = "<html><body><article><h1>Hello</h1><p>World</p></article></body></html>";
assert!(
classify_http_response(200, body).is_none(),
"expected no diagnostic for regular article HTML"
);
}
#[test]
fn classify_response_marks_thin_html_content() {
let classification = classify_response(ResponseAnalysis {
status: 200,
body: "<html></html>",
content_type: Some("text/html"),
html_bytes: Some(20_000),
markdown: Some("short"),
markdown_chars: Some(120),
quality: None,
});
assert!(classification.has_class(ResponseClass::ThinContent));
}
#[test]
fn classify_response_maps_session_expired_to_unauthorized() {
let classification = classify_response(ResponseAnalysis {
status: 200,
body: "<html><body>Your session has expired. Please sign in again.</body></html>",
content_type: Some("text/html"),
html_bytes: None,
markdown: None,
markdown_chars: None,
quality: None,
});
assert_eq!(
classification.primary().map(|signal| signal.class),
Some(ResponseClass::Unauthorized)
);
}
#[test]
fn classify_response_maps_http_401_to_unauthorized() {
let classification = classify_response(ResponseAnalysis {
status: 401,
body: "<html><body>Unauthorized</body></html>",
content_type: Some("text/html"),
html_bytes: None,
markdown: None,
markdown_chars: None,
quality: None,
});
assert_eq!(
classification.primary().map(|signal| signal.class),
Some(ResponseClass::Unauthorized)
);
}
#[test]
fn classify_response_detects_obfuscated_content_blob() {
let blob = format!("Title: Protected article\n\n{}", "AbC123+/".repeat(700));
let classification = classify_response(ResponseAnalysis {
status: 200,
body: "<html><body><script>protected payload</script></body></html>",
content_type: Some("text/html"),
html_bytes: Some(40_000),
markdown: Some(&blob),
markdown_chars: Some(blob.len()),
quality: None,
});
assert_eq!(
classification.primary().map(|signal| signal.class),
Some(ResponseClass::ObfuscatedContent)
);
assert!(
classify_obfuscated_content(Some("text/html"), &blob).is_some(),
"expected blob classification for encoded markdown"
);
}
#[test]
fn classify_obfuscated_content_ignores_readable_article_with_one_blob() {
let article = [
"This article explains a benchmark result in normal prose.",
"It includes enough readable words to look like a real article body.",
"A single pasted token should not dominate the classification.",
&"AbC123+/".repeat(180),
]
.join(" ");
assert!(
classify_obfuscated_content(Some("text/html"), &article).is_none(),
"expected readable article to avoid obfuscated classification"
);
}
#[test]
fn classify_http_response_avoids_login_page_recaptcha_false_positive() {
let body = r#"
<html><body>
<h1>Sign in to continue</h1>
<form><input type="password" name="password"></form>
<div class="g-recaptcha"></div>
</body></html>
"#;
let diagnostic = classify_http_response(200, body).expect("login wall classification");
assert_eq!(
diagnostic.kind,
ResponseDiagnosticKind::AuthRequired(AuthRequiredKind::LoginRequired)
);
}
}