/// Minimum body length, in bytes (`str::len`), that `is_invalid_cached` accepts;
/// anything shorter is reported as "body is too short".
const MIN_BODY_LEN: usize = 500;
/// Substrings that `is_unauthorized_cached` looks for in the ASCII-lowercased body.
///
/// Entries are written in lowercase because they are compared against the
/// lowercased body. They are tried in array order and the first one found is
/// the one reported.
const UNAUTHORIZED_PATTERNS: [&str; 4] = [
"access denied",
"permission denied",
"forbidden",
"unauthorized",
];
/// Substrings that `is_suspicious_cached` looks for in the ASCII-lowercased body.
///
/// The entries are CAPTCHA prompts, challenge-related markup identifiers and
/// bot-detection phrases. They are written in lowercase because they are
/// compared against the lowercased body. They are tried in array order and the
/// first one found is the one reported.
const SUSPICIOUS_PATTERNS: [&str; 12] = [
"verify you are a human",
"please complete the captcha",
"solve this captcha",
"captcha challenge",
"cf-browser-verification",
"cf-captcha-container",
"px-captcha",
"blocked by cloudflare",
"please enable javascript and cookies",
"suspicious activity",
"bot detection",
"perimeterx",
];
/// Returns the ASCII-lowercased form of `body`, memoised in `cache`.
///
/// When `cache` is `None`, the lowercased copy is computed, stored in `cache`,
/// and returned. When `cache` already holds a string, that string is returned
/// unchanged and `body` is not consulted, so a cache must only ever be paired
/// with a single body.
fn ensure_lower<'a>(body: &'a str, cache: &'a mut Option<String>) -> &'a str {
    cache
        .get_or_insert_with(|| body.to_ascii_lowercase())
        .as_str()
}
/// Checks whether `body` is too short or lacks HTML markers.
///
/// Returns `Some(reason)` when the body is rejected and `None` when it passes:
/// - `"body is too short"` if `body` has fewer than `MIN_BODY_LEN` bytes;
/// - `"missing HTML markers"` if the lowercased body contains neither `<html`
///   nor `<!doctype`.
///
/// The lowercased body is obtained through `ensure_lower`, so `cache` is
/// populated as a side effect unless the length check rejects the body first.
fn is_invalid_cached<'a>(body: &'a str, cache: &'a mut Option<String>) -> Option<&'static str> {
    // Length is checked before lowercasing so short bodies never allocate.
    if body.len() < MIN_BODY_LEN {
        return Some("body is too short");
    }

    let lowered = ensure_lower(body, cache);
    let has_html_marker = lowered.contains("<html") || lowered.contains("<!doctype");

    if has_html_marker {
        None
    } else {
        Some("missing HTML markers")
    }
}
/// Searches the lowercased body for any entry of `UNAUTHORIZED_PATTERNS`.
///
/// Returns the first pattern (in array order) that occurs in the body, or
/// `None` when no pattern matches. The lowercased body comes from
/// `ensure_lower`, so `cache` is populated as a side effect.
fn is_unauthorized_cached<'a>(
    body: &'a str,
    cache: &'a mut Option<String>,
) -> Option<&'static str> {
    let haystack = ensure_lower(body, cache);
    for &needle in UNAUTHORIZED_PATTERNS.iter() {
        if haystack.contains(needle) {
            return Some(needle);
        }
    }
    None
}
/// Searches the lowercased body for any entry of `SUSPICIOUS_PATTERNS`.
///
/// Returns the first pattern (in array order) that occurs in the body, or
/// `None` when no pattern matches. The lowercased body comes from
/// `ensure_lower`, so `cache` is populated as a side effect.
fn is_suspicious_cached<'a>(body: &'a str, cache: &'a mut Option<String>) -> Option<&'static str> {
    let haystack = ensure_lower(body, cache);
    for &needle in SUSPICIOUS_PATTERNS.iter() {
        if haystack.contains(needle) {
            return Some(needle);
        }
    }
    None
}
/// Validates an HTTP response by status code and then by body content.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when:
/// - `status_code` is not a success status: the message names the numeric code
///   and, for 429/403/404/401/400/500, a specific label; any other non-success
///   status is labelled "unknown error";
/// - the body fails `is_invalid_cached` (`"invalid - <reason>"`);
/// - the body matches an unauthorized pattern (`"unauthorized - <pattern>"`);
/// - the body matches a suspicious pattern (`"suspicious - <pattern>"`).
///
/// A body that passes the validity check and contains the literal
/// `application/ld+json` (case-sensitive, checked on the raw body) is accepted
/// without running the unauthorized/suspicious pattern checks.
pub(super) fn validate_response(
    status_code: reqwest::StatusCode,
    body: &str,
) -> Result<(), String> {
    if !status_code.is_success() {
        let code = status_code.as_u16();
        let message = match status_code {
            reqwest::StatusCode::TOO_MANY_REQUESTS => format!("rate limited ({})", code),
            reqwest::StatusCode::FORBIDDEN => format!("status {} (forbidden)", code),
            reqwest::StatusCode::NOT_FOUND => format!("status {} (not found)", code),
            reqwest::StatusCode::UNAUTHORIZED => format!("status {} (unauthorized)", code),
            reqwest::StatusCode::BAD_REQUEST => format!("status {} (bad request)", code),
            reqwest::StatusCode::INTERNAL_SERVER_ERROR => {
                format!("status {} (server error)", code)
            }
            _ => format!("status {} (unknown error)", code),
        };
        return Err(message);
    }

    // Shared across the checks below so the body is lowercased at most once.
    let mut lowered_cache = None;

    // The validity check applies whether or not the body carries JSON-LD.
    if let Some(reason) = is_invalid_cached(body, &mut lowered_cache) {
        return Err(format!("invalid - {}", reason));
    }

    // Bodies carrying JSON-LD skip the pattern checks entirely.
    if body.contains("application/ld+json") {
        return Ok(());
    }

    if let Some(pattern) = is_unauthorized_cached(body, &mut lowered_cache) {
        return Err(format!("unauthorized - {}", pattern));
    }

    match is_suspicious_cached(body, &mut lowered_cache) {
        Some(pattern) => Err(format!("suspicious - {}", pattern)),
        None => Ok(()),
    }
}