use std::sync::OnceLock;
use regex::{RegexSet, RegexSetBuilder};
use serde::{Deserialize, Serialize};
pub const CORPUS_VERSION: &str = "cl-2026.06.01";
pub const BODY_PREFIX_LIMIT: usize = 64 * 1024;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum Evidence {
None,
Headers,
Body,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct AntibotVerdict {
pub vendors: Vec<String>,
pub challenged: bool,
pub challenge_vendor: Option<String>,
pub corpus_version: &'static str,
pub evidence: Evidence,
}
impl AntibotVerdict {
fn empty() -> Self {
Self {
vendors: Vec::new(),
challenged: false,
challenge_vendor: None,
corpus_version: CORPUS_VERSION,
evidence: Evidence::None,
}
}
pub fn detected(&self) -> bool {
!self.vendors.is_empty()
}
}
#[derive(Debug, Deserialize)]
struct VendorSig {
vendor: String,
signals: Vec<String>,
#[serde(default)]
challenge: Vec<String>,
}
struct Compiled {
set: RegexSet,
meta: Vec<SignalMeta>,
}
struct SignalMeta {
vendor: String,
challenge: bool,
}
fn compiled() -> &'static Compiled {
static COMPILED: OnceLock<Compiled> = OnceLock::new();
COMPILED.get_or_init(|| {
let vendors: Vec<VendorSig> = serde_json::from_str(CORPUS_JSON).unwrap_or_default();
let mut patterns: Vec<String> = Vec::new();
let mut meta: Vec<SignalMeta> = Vec::new();
for v in vendors {
for p in v.signals {
patterns.push(section_anchor(&p));
meta.push(SignalMeta { vendor: v.vendor.clone(), challenge: false });
}
for p in v.challenge {
patterns.push(section_anchor(&p));
meta.push(SignalMeta { vendor: v.vendor.clone(), challenge: true });
}
}
let set = RegexSetBuilder::new(&patterns)
.case_insensitive(true)
.size_limit(8 * 1024 * 1024)
.build()
.unwrap_or_else(|_| RegexSet::empty());
Compiled { set, meta }
})
}
fn section_anchor(pattern: &str) -> String {
match pattern.strip_prefix("b:") {
Some(rest) => format!("b:.*{rest}"),
None => pattern.to_string(),
}
}
fn flatten(s: &str) -> String {
s.replace(['\n', '\r'], " ")
}
fn normalize_head(status: u16, headers: &[(String, String)]) -> String {
let mut out = String::with_capacity(64 + headers.len() * 48);
out.push_str("S:");
out.push_str(&status.to_string());
for (name, value) in headers {
out.push_str("\nH:");
out.push_str(&flatten(&name.to_lowercase()));
out.push_str(": ");
out.push_str(&flatten(&value.to_lowercase()));
}
out
}
fn scan(haystack: &str) -> (Vec<String>, Option<String>) {
let c = compiled();
let mut vendors: Vec<String> = Vec::new();
let mut challenge_vendor: Option<String> = None;
for idx in c.set.matches(haystack) {
let Some(m) = c.meta.get(idx) else { continue };
if !vendors.iter().any(|v| v == &m.vendor) {
vendors.push(m.vendor.clone());
}
if m.challenge && challenge_vendor.is_none() {
challenge_vendor = Some(m.vendor.clone());
}
}
vendors.sort();
(vendors, challenge_vendor)
}
pub fn classify(status: u16, headers: &[(String, String)], body: &str) -> AntibotVerdict {
let head = normalize_head(status, headers);
let (head_vendors, head_challenge) = scan(&head);
if head_challenge.is_some() {
return AntibotVerdict {
vendors: head_vendors,
challenged: true,
challenge_vendor: head_challenge,
corpus_version: CORPUS_VERSION,
evidence: Evidence::Headers,
};
}
let prefix_end =
body.char_indices().map(|(i, _)| i).nth(BODY_PREFIX_LIMIT).unwrap_or(body.len());
let mut full = head;
full.push_str("\nB:");
full.push_str(&flatten(&body[..prefix_end].to_lowercase()));
let (vendors, challenge_vendor) = scan(&full);
if vendors.is_empty() {
return AntibotVerdict::empty();
}
let evidence = if vendors.len() > head_vendors.len() || challenge_vendor.is_some() {
Evidence::Body
} else {
Evidence::Headers
};
AntibotVerdict {
vendors,
challenged: challenge_vendor.is_some(),
challenge_vendor,
corpus_version: CORPUS_VERSION,
evidence,
}
}
const CORPUS_JSON: &str = r#"
[
{
"vendor": "cloudflare",
"signals": ["h:server: cloudflare", "h:cf-ray:", "b:cdn-cgi/"],
"challenge": [
"h:cf-mitigated: challenge",
"b:just a moment\\.\\.\\.",
"b:challenges\\.cloudflare\\.com/turnstile",
"b:cf-turnstile",
"b:/cdn-cgi/challenge-platform",
"b:cf_chl_"
]
},
{
"vendor": "datadome",
"signals": ["h:x-datadome", "h:set-cookie: datadome=", "b:datadome"],
"challenge": ["b:geo\\.captcha-delivery\\.com", "b:captcha-delivery\\.com", "h:x-dd-b:"]
},
{
"vendor": "akamai",
"signals": ["h:server: akamaighost", "h:x-akamai-transformed", "b:ak_bmsc", "b:_abck"],
"challenge": ["b:reference #[0-9a-f]{2}\\.", "b:errors\\.edgesuite\\.net"]
},
{
"vendor": "imperva",
"signals": ["h:x-iinfo", "h:set-cookie: visid_incap", "h:x-cdn: incapsula"],
"challenge": ["b:_incapsula_resource", "b:incident id"]
},
{
"vendor": "perimeterx",
"signals": ["h:set-cookie: _px", "b:window\\._pxappid", "b:px-cdn"],
"challenge": ["b:px-captcha", "b:/px/captcha", "b:perimeterx"]
},
{
"vendor": "kasada",
"signals": ["h:x-kpsdk-ct", "h:x-kpsdk-cd", "b:kpsdk"],
"challenge": ["b:/_kpsdk", "b:ips\\.js"]
},
{
"vendor": "awswaf",
"signals": ["h:x-amzn-waf-action", "b:awswaf"],
"challenge": ["b:token\\.awswaf", "b:challenge\\.compact"]
},
{
"vendor": "f5",
"signals": ["h:set-cookie: bigipserver", "h:set-cookie: ts[0-9a-f]{6}", "h:server: big-?ip"],
"challenge": ["b:the requested url was rejected", "b:support id is"]
},
{
"vendor": "sucuri",
"signals": ["h:server: sucuri", "h:x-sucuri-id"],
"challenge": ["h:x-sucuri-block", "b:sucuri website firewall"]
},
{
"vendor": "cloudfront",
"signals": ["h:x-amz-cf-id", "h:via:.*cloudfront"],
"challenge": ["b:generated by cloudfront"]
},
{
"vendor": "recaptcha",
"signals": [],
"challenge": ["b:www\\.google\\.com/recaptcha", "b:grecaptcha", "b:g-recaptcha"]
},
{
"vendor": "hcaptcha",
"signals": [],
"challenge": ["b:hcaptcha\\.com", "b:h-captcha"]
}
]
"#;
#[cfg(test)]
mod tests {
use super::*;
fn h(pairs: &[(&str, &str)]) -> Vec<(String, String)> {
pairs.iter().map(|(k, v)| ((*k).to_string(), (*v).to_string())).collect()
}
#[test]
fn corpus_compiles_nonempty() {
assert!(compiled().set.len() > 10, "corpus failed to compile");
}
#[test]
fn cloudflare_presence_only_is_not_challenged() {
let v = classify(
200,
&h(&[("server", "cloudflare"), ("cf-ray", "8a1b2c3d4e5f")]),
"<html>ok</html>",
);
assert!(v.vendors.contains(&"cloudflare".to_string()));
assert!(!v.challenged, "mere presence must not count as a challenge");
assert_eq!(v.evidence, Evidence::Headers);
assert!(v.challenge_vendor.is_none());
}
#[test]
fn cloudflare_turnstile_interstitial_is_challenged() {
let body = "<title>Just a moment...</title><script src=\"https://challenges.cloudflare.com/turnstile/v0/api.js\"></script>";
let v = classify(403, &h(&[("server", "cloudflare"), ("cf-mitigated", "challenge")]), body);
assert!(v.challenged);
assert_eq!(v.challenge_vendor.as_deref(), Some("cloudflare"));
assert_eq!(v.evidence, Evidence::Headers);
}
#[test]
fn datadome_block_via_header() {
let v = classify(
403,
&h(&[("x-datadome", "protected"), ("set-cookie", "datadome=abc; Path=/")]),
"blocked",
);
assert!(v.vendors.contains(&"datadome".to_string()));
}
#[test]
fn body_cloaked_recaptcha_uses_body_tier() {
let body = "<div class=\"g-recaptcha\" data-sitekey=\"x\"></div>";
let v = classify(200, &h(&[("content-type", "text/html")]), body);
assert!(v.vendors.contains(&"recaptcha".to_string()));
assert!(v.challenged);
assert_eq!(v.evidence, Evidence::Body);
}
#[test]
fn clean_page_detects_nothing() {
let v = classify(
200,
&h(&[("server", "nginx"), ("content-type", "text/html")]),
"<html><body>hello</body></html>",
);
assert!(!v.detected());
assert!(!v.challenged);
assert_eq!(v.evidence, Evidence::None);
assert_eq!(v.corpus_version, CORPUS_VERSION);
}
#[test]
fn body_prefix_is_bounded() {
let mut body = "x".repeat(BODY_PREFIX_LIMIT + 1024);
body.push_str("h-captcha");
let v = classify(200, &h(&[]), &body);
assert!(!v.detected(), "markers past BODY_PREFIX_LIMIT must not match");
}
}