use std::fmt;
use std::num::NonZeroU32;
use std::sync::Arc;
use std::time::{Duration, Instant};
use reqwest::redirect;
use crate::ban;
use crate::browser::{BrowserBackend, BrowserBudget, RenderedPage};
use crate::check::{CheckOutcome, MatchKind, UncertainReason};
use crate::error::{Error, Result};
use crate::retry::{self, RetryPolicy};
use crate::robots::RobotsCache;
use crate::site::{Probe, Signal, SignalVerdict, Site, aggregate};
use crate::throttle::HostThrottle;
use crate::username::Username;
/// Overall request timeout used by `ClientBuilder::default`.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connection timeout used by `ClientBuilder::default`.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect limit used by `ClientBuilder::default` when redirects are followed.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Interval handed to the per-host throttle by `ClientBuilder::default`.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single key every request waits on when the global throttle is enabled, regardless of host.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
/// HTTP probing client: checks whether a username exists on a site and returns a `CheckOutcome`.
///
/// Built through [`Client::builder`]. Cloning copies the handles; the browser budget lives
/// behind an `Arc`, so clones draw from the same budget.
#[derive(Clone)]
pub struct Client {
/// Underlying reqwest client used for raw HTTP requests.
inner: reqwest::Client,
/// Per-host throttle; waited on with the request's host before every raw HTTP request.
throttle: HostThrottle,
/// Optional global throttle (present only when `max_rps` was configured); waited on with
/// `GLOBAL_THROTTLE_KEY` before the per-host throttle.
global_throttle: Option<HostThrottle>,
/// Policy consulted by `check` to decide whether and how long to wait before retrying.
retry: RetryPolicy,
/// Pool of user agents; when non-empty, one entry is sent as `User-Agent` on each request.
user_agents: Arc<[String]>,
/// Whether enrichment extraction runs for found accounts on sites that declare `extract` rules.
enrich: bool,
/// robots.txt cache; present only when `respect_robots` was enabled on the builder.
robots: Option<RobotsCache>,
/// Optional browser backend used for sites tagged `bot-protected`.
browser: Option<Arc<dyn BrowserBackend>>,
/// Budget consulted (`try_consume`) before each browser-backed probe.
browser_budget: Arc<BrowserBudget>,
}
impl Client {
/// Returns a [`ClientBuilder`] initialised with its default settings.
pub fn builder() -> ClientBuilder {
ClientBuilder::default()
}
/// Checks `username` on `site`, retrying for as long as the retry policy asks for it.
///
/// Every iteration runs one probe. When `retry::should_retry` declines (given the outcome,
/// the number of retries already performed and the configured policy) that outcome is
/// returned as-is; otherwise the call sleeps for `retry::backoff_delay` and probes again.
#[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
    // Number of retries already performed; zero for the initial probe.
    let mut retries_done: u32 = 0;
    loop {
        let outcome = self.probe_once(site, username).await;
        let retry_again = retry::should_retry(&outcome, retries_done, &self.retry);
        if !retry_again {
            break outcome;
        }
        let delay = retry::backoff_delay(retries_done, &self.retry);
        tracing::info!(
            site = %site.name,
            attempt = retries_done + 1,
            reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
            ?delay,
            "transient ban, retrying",
        );
        tokio::time::sleep(delay).await;
        retries_done += 1;
    }
}
/// Performs one throttled GET of `url` and returns the status, final URL and body.
///
/// Waits on the global throttle (when configured) and then on the per-host throttle before
/// sending. Returns `None` when the request itself fails. A body that cannot be read is
/// reported as an empty string rather than as a failure.
pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
    if let Some(global) = self.global_throttle.as_ref() {
        global.wait(GLOBAL_THROTTLE_KEY).await;
    }
    let host = host_of(url);
    self.throttle.wait(&host).await;

    // Only override the client-level User-Agent when a rotation pool is configured.
    let request = match self.pick_user_agent() {
        Some(ua) => self.inner.get(url).header(reqwest::header::USER_AGENT, ua),
        None => self.inner.get(url),
    };
    let response = request.send().await.ok()?;

    // Capture metadata first: reading the body consumes the response.
    let status = response.status().as_u16();
    let final_url = response.url().to_string();
    let body = response.text().await.unwrap_or_default();
    Some(RawResponse {
        status,
        final_url,
        body,
    })
}
/// Fetches `url` for diagnostics, preferring the browser backend for `bot-protected` sites.
///
/// When a browser backend is configured and `site` carries the `bot-protected` tag
/// (compared case-insensitively), the page is fetched through the backend. A browser
/// failure is logged and the call falls back to [`Client::fetch`]; an unparsable `url` on
/// the browser path yields `None`. Every other site goes straight to [`Client::fetch`].
pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
    let bot_protected = site
        .tags
        .iter()
        .any(|tag| tag.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
    if let Some(backend) = self.browser.as_deref().filter(|_| bot_protected) {
        let parsed = url::Url::parse(url).ok()?;
        let rendered = backend
            .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
            .await;
        match rendered {
            Ok(page) => {
                let raw = RawResponse {
                    status: page.status,
                    final_url: page.final_url.to_string(),
                    body: page.body,
                };
                return Some(raw);
            }
            Err(err) => {
                tracing::warn!(
                    site = %site.name, %url, error = %err,
                    "browser fetch failed in doctor; falling back to raw HTTP",
                );
            }
        }
    }
    self.fetch(url).await
}
/// Chooses the `User-Agent` override for the next request, if any.
///
/// Returns `None` when no rotation pool is configured, the sole entry when the pool has
/// exactly one (without touching the random generator), and otherwise an entry picked with
/// `fastrand`.
fn pick_user_agent(&self) -> Option<&str> {
    let pool: &[String] = &self.user_agents;
    match pool {
        [] => None,
        [only] => Some(only.as_str()),
        many => {
            let idx = fastrand::usize(0..many.len());
            Some(many[idx].as_str())
        }
    }
}
#[allow(clippy::too_many_lines)]
async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
let url = site.url_for(username);
if let Some(pat) = &site.regex_check {
if let Ok(re) = regex::Regex::new(pat) {
if !re.is_match(username.as_str()) {
return uncertain(
&site.name,
url,
Instant::now(),
UncertainReason::UsernameNotAllowed,
);
}
}
}
if let Some(backend) = self.browser.as_deref() {
if site
.tags
.iter()
.any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
{
if self.browser_budget.try_consume() {
return self.probe_with_browser(site, &url, backend).await;
}
tracing::warn!(site = %site.name, "browser budget exhausted");
return uncertain(
&site.name,
url,
Instant::now(),
UncertainReason::BrowserBudget,
);
}
}
let host = host_of(&url);
if let Some(robots) = &self.robots {
if let Some((origin, path)) = origin_and_path(&url) {
if !robots.allowed(&origin, &path).await {
tracing::debug!(%url, "skipped by robots.txt");
return uncertain(
&site.name,
url,
Instant::now(),
UncertainReason::RobotsDisallowed,
);
}
}
}
if let Some(global) = &self.global_throttle {
global.wait(GLOBAL_THROTTLE_KEY).await;
}
self.throttle.wait(&host).await;
let started = Instant::now();
tracing::debug!(%url, %host, "probing");
let mut request = self.inner.get(&url);
if let Some(ua) = self.pick_user_agent() {
request = request.header(reqwest::header::USER_AGENT, ua);
}
let response = match request.send().await {
Ok(r) => r,
Err(err) => {
tracing::debug!(error = %err, "request failed");
return uncertain(
&site.name,
url,
started,
UncertainReason::Network(err.to_string()),
);
}
};
let status = response.status().as_u16();
let final_url = response.url().to_string();
if let Some(reason) = ban::detect_pre_body(status, response.headers()) {
tracing::warn!(%host, status, %reason, "ban-like response");
return uncertain(&site.name, url, started, reason);
}
let want_enrich = self.enrich && !site.extract.is_empty();
let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
let body = if needs_body {
match response.text().await {
Ok(b) => b,
Err(err) => {
return uncertain(
&site.name,
url,
started,
UncertainReason::BodyRead(err.to_string()),
);
}
}
} else {
String::new()
};
if !body.is_empty() {
if let Some(reason) = ban::detect_in_body(&body) {
tracing::warn!(%host, %reason, "ban-like body");
return uncertain(&site.name, url, started, reason);
}
}
let probe = Probe {
status,
final_url: &final_url,
body: &body,
};
let votes: Vec<(&Signal, SignalVerdict)> = site
.signals
.iter()
.map(|s| (s, s.evaluate(&probe)))
.collect();
let kind = aggregate(votes.iter().map(|(_, v)| *v));
let mut result = outcome(&site.name, url, started, kind);
let winning = match kind {
MatchKind::Found => Some(SignalVerdict::Found),
MatchKind::NotFound => Some(SignalVerdict::NotFound),
MatchKind::Uncertain => None,
};
if let Some(want) = winning {
result.evidence = votes
.iter()
.filter(|(_, v)| *v == want)
.map(|(s, _)| s.describe_match(&probe))
.collect();
}
if want_enrich && kind == MatchKind::Found {
result.enrichment = crate::enrich::extract(&body, &site.extract);
}
result
}
/// Probes `site` by rendering `url` through the browser `backend`.
///
/// An unparsable URL or a backend failure yields an uncertain outcome. Otherwise the
/// rendered page is run through the site's signals exactly like a raw HTTP response:
/// verdicts are aggregated, the signals agreeing with the final verdict are recorded as
/// evidence, and enrichment runs for found accounts when enabled and configured.
async fn probe_with_browser(
    &self,
    site: &Site,
    url: &str,
    backend: &dyn BrowserBackend,
) -> CheckOutcome {
    let started = Instant::now();

    let target = match url::Url::parse(url) {
        Ok(target) => target,
        Err(err) => {
            let reason = UncertainReason::Other(format!("invalid url: {err}"));
            return uncertain(&site.name, url.to_owned(), started, reason);
        }
    };

    let rendered = backend
        .fetch(&target, &site.request_headers, BROWSER_TIMEOUT)
        .await;
    let page: RenderedPage = match rendered {
        Ok(page) => page,
        Err(err) => {
            tracing::warn!(site = %site.name, %url, error = %err, "browser fetch failed");
            let reason = UncertainReason::BrowserFailed(err.to_string());
            return uncertain(&site.name, url.to_owned(), started, reason);
        }
    };

    let landed_on = String::from(page.final_url.as_str());
    let probe = Probe {
        status: page.status,
        final_url: &landed_on,
        body: &page.body,
    };

    // Evaluate every signal once, remembering which signal produced which verdict.
    let mut votes: Vec<(&Signal, SignalVerdict)> = Vec::new();
    for signal in &site.signals {
        votes.push((signal, signal.evaluate(&probe)));
    }
    let kind = aggregate(votes.iter().map(|(_, verdict)| *verdict));
    let mut result = outcome(&site.name, url.to_owned(), started, kind);

    let wanted = match kind {
        MatchKind::Uncertain => None,
        MatchKind::Found => Some(SignalVerdict::Found),
        MatchKind::NotFound => Some(SignalVerdict::NotFound),
    };
    if let Some(want) = wanted {
        // Only signals that voted for the final verdict count as evidence.
        let mut evidence = Vec::new();
        for (signal, verdict) in &votes {
            if *verdict == want {
                evidence.push(signal.describe_match(&probe));
            }
        }
        result.evidence = evidence;
    }

    let should_enrich = self.enrich && kind == MatchKind::Found && !site.extract.is_empty();
    if should_enrich {
        result.enrichment = crate::enrich::extract(&page.body, &site.extract);
    }
    result
}
}
/// Minimal view of a fetched page, as returned by `Client::fetch` and
/// `Client::fetch_for_doctor`.
#[derive(Debug, Clone)]
pub struct RawResponse {
/// HTTP status code of the response.
pub status: u16,
/// URL the response was ultimately served from.
pub final_url: String,
/// Response body as text.
pub body: String,
}
/// Builder for `Client`; created by `Client::builder` and finished with `build`.
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
pub struct ClientBuilder {
/// Overall request timeout handed to reqwest.
timeout: Duration,
/// Connection timeout handed to reqwest.
connect_timeout: Duration,
/// Client-level `User-Agent` handed to reqwest.
user_agent: String,
/// Whether redirects are followed at all.
follow_redirects: bool,
/// Maximum number of redirects when `follow_redirects` is true.
redirect_limit: usize,
/// Interval handed to the per-host throttle.
min_request_interval: Duration,
/// Optional global requests-per-second cap; `None` disables the global throttle.
max_rps: Option<NonZeroU32>,
/// Retry policy copied into the built client.
retry: RetryPolicy,
/// Optional proxy URL, validated in `build`.
proxy: Option<String>,
/// User-agent rotation pool; empty means no per-request override.
user_agents: Vec<String>,
/// Whether enrichment extraction is enabled.
enrich: bool,
/// Whether a robots.txt cache is created for the client.
respect_robots: bool,
/// Optional browser backend for `bot-protected` sites.
browser: Option<Arc<dyn BrowserBackend>>,
/// Cap passed to `BrowserBudget::new`.
browser_budget: usize,
}
impl Default for ClientBuilder {
/// Builds the default configuration: module-level default timeouts, redirect limit and
/// per-host interval; the crate's default user agent; redirects followed; no global rate
/// cap, proxy, user-agent rotation, enrichment, robots.txt handling or browser backend;
/// and `DEFAULT_BROWSER_BUDGET` as the browser budget.
fn default() -> Self {
Self {
timeout: DEFAULT_TIMEOUT,
connect_timeout: DEFAULT_CONNECT_TIMEOUT,
user_agent: default_user_agent(),
follow_redirects: true,
redirect_limit: DEFAULT_REDIRECT_LIMIT,
min_request_interval: DEFAULT_PER_HOST_INTERVAL,
max_rps: None,
retry: RetryPolicy::default(),
proxy: None,
user_agents: Vec::new(),
enrich: false,
respect_robots: false,
browser: None,
browser_budget: DEFAULT_BROWSER_BUDGET,
}
}
}
impl ClientBuilder {
    /// Sets the overall per-request timeout.
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    /// Sets the timeout for establishing a connection.
    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
        self.connect_timeout = timeout;
        self
    }

    /// Sets the client-level `User-Agent` header.
    ///
    /// A non-empty rotation pool (see [`ClientBuilder::rotate_user_agents`]) overrides this
    /// header on each request.
    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
        self.user_agent = user_agent.into();
        self
    }

    /// Enables or disables following HTTP redirects.
    pub fn follow_redirects(mut self, follow: bool) -> Self {
        self.follow_redirects = follow;
        self
    }

    /// Sets the maximum number of redirects followed per request.
    ///
    /// Only takes effect while redirects are being followed (see
    /// [`ClientBuilder::follow_redirects`]).
    pub fn redirect_limit(mut self, limit: usize) -> Self {
        self.redirect_limit = limit;
        self
    }

    /// Sets the interval handed to the per-host throttle.
    pub fn min_request_interval(mut self, interval: Duration) -> Self {
        self.min_request_interval = interval;
        self
    }

    /// Caps the global request rate; `build` creates a throttle with an interval of
    /// one second divided by `rps`.
    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
        self.max_rps = Some(rps);
        self
    }

    /// Sets the retry policy's `max_retries`.
    pub fn max_retries(mut self, n: u32) -> Self {
        self.retry.max_retries = n;
        self
    }

    /// Sets the retry policy's `base_delay`.
    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
        self.retry.base_delay = d;
        self
    }

    /// Sets the retry policy's `max_delay`.
    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
        self.retry.max_delay = d;
        self
    }

    /// Routes all requests through the proxy at `url`.
    ///
    /// The URL is validated in [`ClientBuilder::build`]; it must start with `http://`,
    /// `https://`, `socks5://` or `socks5h://` (the scheme is matched case-insensitively).
    pub fn proxy(mut self, url: impl Into<String>) -> Self {
        self.proxy = Some(url.into());
        self
    }

    /// Sets the pool of user agents to pick from on each request; an empty pool disables
    /// the per-request override.
    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
        self.user_agents = agents;
        self
    }

    /// Enables or disables enrichment extraction for found accounts.
    pub fn enrich(mut self, enrich: bool) -> Self {
        self.enrich = enrich;
        self
    }

    /// Enables or disables robots.txt checks before raw HTTP probes.
    pub fn respect_robots(mut self, respect: bool) -> Self {
        self.respect_robots = respect;
        self
    }

    /// Installs the browser backend used for sites tagged `bot-protected`.
    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
        self.browser = Some(backend);
        self
    }

    /// Sets the cap passed to `BrowserBudget::new` for the built client.
    pub const fn browser_budget(mut self, cap: usize) -> Self {
        self.browser_budget = cap;
        self
    }

    /// Validates the configuration and constructs the [`Client`].
    ///
    /// # Errors
    ///
    /// Returns `Error::HttpSetup` when the proxy URL does not start with a supported scheme,
    /// when reqwest rejects the proxy URL, or when the underlying reqwest client cannot be
    /// built.
    pub fn build(self) -> Result<Client> {
        let redirect_policy = if self.follow_redirects {
            redirect::Policy::limited(self.redirect_limit)
        } else {
            redirect::Policy::none()
        };
        let mut builder = reqwest::Client::builder()
            .user_agent(self.user_agent)
            .timeout(self.timeout)
            .connect_timeout(self.connect_timeout)
            .redirect(redirect_policy);
        if let Some(proxy_url) = &self.proxy {
            const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
            // URI schemes are case-insensitive, so compare against a lowercased copy;
            // the original string is still what gets reported and handed to reqwest.
            let lowered = proxy_url.to_ascii_lowercase();
            if !SCHEMES.iter().any(|s| lowered.starts_with(s)) {
                return Err(Error::HttpSetup {
                    message: format!(
                        "invalid proxy {proxy_url:?}: must start with one of {}",
                        SCHEMES.join(", ")
                    ),
                });
            }
            let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
                message: format!("invalid proxy {proxy_url:?}: {e}"),
            })?;
            builder = builder.proxy(proxy);
        }
        let inner = builder.build().map_err(|e| Error::HttpSetup {
            message: e.to_string(),
        })?;
        // `rps` is non-zero by construction, so the division cannot panic.
        let global_throttle = self.max_rps.map(|rps| {
            let interval = Duration::from_secs(1) / rps.get();
            HostThrottle::new(interval)
        });
        let robots = self
            .respect_robots
            .then(|| RobotsCache::new(inner.clone(), "adler"));
        Ok(Client {
            inner,
            throttle: HostThrottle::new(self.min_request_interval),
            global_throttle,
            retry: self.retry,
            user_agents: Arc::from(self.user_agents),
            enrich: self.enrich,
            robots,
            browser: self.browser,
            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
        })
    }
}
/// Browser budget cap used by `ClientBuilder::default`.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
impl fmt::Debug for Client {
    /// Formats the client's configuration.
    ///
    /// The robots cache and browser backend are shown only as presence flags, and the inner
    /// reqwest client is omitted, so the output is marked non-exhaustive.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let has_robots = self.robots.is_some();
        let has_browser = self.browser.is_some();

        let mut out = f.debug_struct("Client");
        out.field("throttle", &self.throttle);
        out.field("global_throttle", &self.global_throttle);
        out.field("retry", &self.retry);
        out.field("user_agents", &self.user_agents);
        out.field("enrich", &self.enrich);
        out.field("robots", &has_robots);
        out.field("browser", &has_browser);
        out.field("browser_budget", &self.browser_budget);
        out.finish_non_exhaustive()
    }
}
impl fmt::Debug for ClientBuilder {
    /// Formats every builder setting; the browser backend is shown only as a presence flag.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let has_browser = self.browser.is_some();

        let mut out = f.debug_struct("ClientBuilder");
        out.field("timeout", &self.timeout);
        out.field("connect_timeout", &self.connect_timeout);
        out.field("user_agent", &self.user_agent);
        out.field("follow_redirects", &self.follow_redirects);
        out.field("redirect_limit", &self.redirect_limit);
        out.field("min_request_interval", &self.min_request_interval);
        out.field("max_rps", &self.max_rps);
        out.field("retry", &self.retry);
        out.field("proxy", &self.proxy);
        out.field("user_agents", &self.user_agents);
        out.field("enrich", &self.enrich);
        out.field("respect_robots", &self.respect_robots);
        out.field("browser", &has_browser);
        out.field("browser_budget", &self.browser_budget);
        out.finish()
    }
}
/// Timeout passed to every `BrowserBackend::fetch` call made by the client.
const BROWSER_TIMEOUT: Duration = Duration::from_secs(60);
/// Site tag (compared case-insensitively) that routes a site through the browser backend.
const BOT_PROTECTED_TAG: &str = "bot-protected";
/// Returns the default `User-Agent`: `adler/` followed by the crate version baked in at
/// compile time.
fn default_user_agent() -> String {
    const UA: &str = concat!("adler/", env!("CARGO_PKG_VERSION"));
    UA.to_owned()
}
/// Extracts the host of `url`, or `"unknown"` when the URL cannot be parsed or has no host.
fn host_of(url: &str) -> String {
    const FALLBACK: &str = "unknown";
    let Ok(parsed) = reqwest::Url::parse(url) else {
        return FALLBACK.into();
    };
    match parsed.host_str() {
        Some(host) => host.to_owned(),
        None => FALLBACK.into(),
    }
}
/// Splits `url` into its origin (`scheme://host[:port]`) and its path including any query
/// string.
///
/// Returns `None` when the URL cannot be parsed or has no host. The port appears in the
/// origin only when the parsed URL reports one.
fn origin_and_path(url: &str) -> Option<(String, String)> {
    let parsed = reqwest::Url::parse(url).ok()?;
    let host = parsed.host_str()?;

    let mut origin = format!("{}://{host}", parsed.scheme());
    if let Some(port) = parsed.port() {
        origin.push(':');
        origin.push_str(&port.to_string());
    }

    let path = match parsed.query() {
        Some(query) => format!("{}?{query}", parsed.path()),
        None => parsed.path().to_owned(),
    };
    Some((origin, path))
}
/// Builds a `CheckOutcome` of the given `kind` with no reason, no evidence and no
/// enrichment, timing it from `started`.
fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
    let site = site.to_owned();
    let elapsed = elapsed_ms(started);
    let enrichment = std::collections::BTreeMap::new();
    let evidence = Vec::new();
    CheckOutcome {
        site,
        url,
        kind,
        reason: None,
        elapsed_ms: elapsed,
        enrichment,
        evidence,
    }
}
fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
CheckOutcome {
site: site.to_owned(),
url,
kind: MatchKind::Uncertain,
reason: Some(reason),
elapsed_ms: elapsed_ms(started),
enrichment: std::collections::BTreeMap::new(),
evidence: Vec::new(),
}
}
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX` if the value does
/// not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis = started.elapsed().as_millis();
    u64::try_from(millis).unwrap_or(u64::MAX)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::site::{Signal, UrlTemplate};
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};
/// Test client with a 2 s timeout, no per-host throttling delay and retries disabled.
fn build_client() -> Client {
Client::builder()
.timeout(Duration::from_secs(2))
.min_request_interval(Duration::ZERO)
.max_retries(0)
.build()
.expect("client builds")
}
/// Builds a site named "Mock" whose URL template is `<server>/{username}` and which uses the
/// given `signals`; every other field is left empty or unset.
fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
Site {
name: "Mock".into(),
url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
signals,
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
}
}
/// The username ("alice") probed by every test.
fn user() -> Username {
Username::new("alice").unwrap()
}
/// A username rejected by `regex_check` yields `UsernameNotAllowed` and sends no request.
#[tokio::test]
async fn regex_check_short_circuits_before_any_request() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
// "alice" has only five letters, so it cannot match an eight-letter minimum.
site.regex_check = Some("^[A-Za-z]{8,}$".into());
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert!(
matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
"expected UsernameNotAllowed, got {:?}",
outcome.reason,
);
let recvd = server.received_requests().await.unwrap_or_default();
assert_eq!(
recvd.len(),
0,
"regex_check mismatch must skip the HTTP request entirely"
);
}
/// A username accepted by `regex_check` is probed normally.
#[tokio::test]
async fn regex_check_pass_proceeds_to_probe() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
site.regex_check = Some("^[a-z]{3,}$".into());
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
}
/// A matching `StatusFound` signal yields `Found` with no reason and one evidence entry.
#[tokio::test]
async fn status_signal_reports_found_on_match() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
assert!(outcome.url.ends_with("/alice"));
assert!(outcome.reason.is_none());
assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
}
/// With both status signals configured, a 404 yields `NotFound` and matching evidence.
#[tokio::test]
async fn status_signal_pair_reports_not_found_on_404() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(404))
.mount(&server)
.await;
let site = site_with(
&server,
vec![
Signal::StatusFound { codes: vec![200] },
Signal::StatusNotFound { codes: vec![404] },
],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::NotFound);
assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
}
/// A `BodyAbsent` marker present in the body yields `NotFound`.
#[tokio::test]
async fn body_absent_signal_detects_missing_account() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
.mount(&server)
.await;
let site = site_with(
&server,
vec![Signal::BodyAbsent {
text: "Profile not found".into(),
}],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::NotFound);
}
/// A lone `BodyAbsent` signal whose marker is missing from the body yields `Uncertain`.
#[tokio::test]
async fn body_absent_alone_yields_uncertain_when_marker_missing() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
.mount(&server)
.await;
let site = site_with(
&server,
vec![Signal::BodyAbsent {
text: "Profile not found".into(),
}],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
}
/// A matching `BodyPresent` marker together with a non-matching `BodyAbsent` yields `Found`.
#[tokio::test]
async fn body_present_plus_absent_resolve_to_found() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(
ResponseTemplate::new(200)
.set_body_string(r#"<div class="profile-card">alice</div>"#),
)
.mount(&server)
.await;
let site = site_with(
&server,
vec![
Signal::BodyPresent {
text: "profile-card".into(),
},
Signal::BodyAbsent {
text: "Profile not found".into(),
},
],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
}
/// A redirect ending on a URL containing the `RedirectAbsent` fragment yields `NotFound`.
#[tokio::test]
async fn redirect_absent_signal_detects_missing_account() {
let server = MockServer::start().await;
// The profile URL redirects to the login page, which the second mock serves.
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(
ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/login"))
.respond_with(ResponseTemplate::new(200).set_body_string("login page"))
.mount(&server)
.await;
let site = site_with(
&server,
vec![Signal::RedirectAbsent {
fragment: "/login".into(),
}],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::NotFound);
}
/// When a positive and a negative signal both match, the outcome is `NotFound`.
#[tokio::test]
async fn negative_signal_wins_over_positive() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
.mount(&server)
.await;
let site = site_with(
&server,
vec![
Signal::StatusFound { codes: vec![200] },
Signal::BodyAbsent {
text: "Profile not found".into(),
},
],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::NotFound);
}
/// A request to a port nothing listens on yields `Uncertain` with a reason attached.
#[tokio::test]
async fn network_failure_yields_uncertain() {
// Bind to an ephemeral port and release it immediately to obtain a port that is
// expected to refuse connections.
let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
let port = listener.local_addr().unwrap().port();
drop(listener);
let site = Site {
name: "Dead".into(),
url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
signals: vec![Signal::StatusFound { codes: vec![200] }],
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
};
let client = Client::builder()
.timeout(Duration::from_millis(500))
.connect_timeout(Duration::from_millis(500))
.max_retries(0)
.build()
.unwrap();
let outcome = client.check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert!(outcome.reason.is_some());
}
/// With a 300 ms per-host interval, a second probe of the same host takes at least 200 ms.
#[tokio::test]
async fn throttle_spaces_consecutive_calls_to_same_host() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let client = Client::builder()
.timeout(Duration::from_secs(2))
.min_request_interval(Duration::from_millis(300))
.build()
.unwrap();
client.check(&site, &user()).await;
let started = Instant::now();
client.check(&site, &user()).await;
let elapsed = started.elapsed();
// The bound is looser than the configured 300 ms interval.
assert!(
elapsed >= Duration::from_millis(200),
"second probe to the same host should wait ≥200 ms, got {elapsed:?}",
);
}
/// The builder's `user_agent` is sent as the `User-Agent` header.
#[tokio::test]
async fn builder_overrides_user_agent() {
let server = MockServer::start().await;
// The mock only matches requests carrying the expected header.
Mock::given(method("GET"))
.and(path("/alice"))
.and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let client = Client::builder()
.user_agent("adler-test/1.0")
.build()
.unwrap();
let outcome = client.check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
}
/// A 429 response yields `Uncertain` with reason `RateLimited`.
#[tokio::test]
async fn rate_limit_429_yields_uncertain_with_note() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(429))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
}
/// A 503 with `server: cloudflare` yields `Uncertain` with reason `CloudflareChallenge`.
#[tokio::test]
async fn cloudflare_server_header_yields_uncertain() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
}
/// A 200 whose body is a "Just a moment..." page yields `CloudflareChallenge`.
#[tokio::test]
async fn cloudflare_interstitial_in_body_yields_uncertain() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(
ResponseTemplate::new(200)
.set_body_string("<html><head><title>Just a moment...</title></head></html>"),
)
.mount(&server)
.await;
// A body-based signal makes the client read the body.
let site = site_with(
&server,
vec![Signal::BodyAbsent {
text: "Profile not found".into(),
}],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
}
/// A plain 403 that the site lists as a not-found status yields `NotFound`, not a ban.
#[tokio::test]
async fn ban_detection_does_not_fire_on_legitimate_403() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(403))
.mount(&server)
.await;
let site = site_with(
&server,
vec![
Signal::StatusFound { codes: vec![200] },
Signal::StatusNotFound { codes: vec![403] },
],
);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::NotFound);
assert!(outcome.reason.is_none());
}
/// One 429 followed by a 200 ends in `Found` when retries are enabled.
#[tokio::test]
async fn retry_recovers_after_transient_429() {
let server = MockServer::start().await;
// The 429 mock is limited to a single response; the 200 mock has no such limit.
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(429))
.up_to_n_times(1)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let client = Client::builder()
.timeout(Duration::from_secs(2))
.min_request_interval(Duration::ZERO)
.max_retries(2)
.base_backoff_delay(Duration::from_millis(20))
.max_backoff_delay(Duration::from_millis(100))
.build()
.unwrap();
let outcome = client.check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
assert!(outcome.reason.is_none());
}
/// A server that always answers 429 ends in `Uncertain`/`RateLimited` after the retries.
#[tokio::test]
async fn retry_exhausts_and_returns_uncertain() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(429))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let client = Client::builder()
.timeout(Duration::from_secs(2))
.min_request_interval(Duration::ZERO)
.max_retries(2)
.base_backoff_delay(Duration::from_millis(10))
.max_backoff_delay(Duration::from_millis(50))
.build()
.unwrap();
let outcome = client.check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
}
/// A network error returns promptly even with retries enabled and a 60 s base backoff.
#[tokio::test]
async fn retry_does_not_fire_on_network_error() {
// Bind to an ephemeral port and release it immediately to obtain a port that is
// expected to refuse connections.
let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
let port = listener.local_addr().unwrap().port();
drop(listener);
let site = Site {
name: "Dead".into(),
url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
signals: vec![Signal::StatusFound { codes: vec![200] }],
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
};
let client = Client::builder()
.timeout(Duration::from_millis(500))
.connect_timeout(Duration::from_millis(500))
.min_request_interval(Duration::ZERO)
.max_retries(3)
.base_backoff_delay(Duration::from_secs(60))
.build()
.unwrap();
let started = Instant::now();
let outcome = client.check(&site, &user()).await;
// Had a retry been scheduled, the 60 s base delay would blow this bound.
assert!(started.elapsed() < Duration::from_secs(5));
assert_eq!(outcome.kind, MatchKind::Uncertain);
assert!(
matches!(outcome.reason, Some(UncertainReason::Network(_))),
"got {:?}",
outcome.reason,
);
}
/// A single-entry rotation pool is sent as the `User-Agent` header.
#[tokio::test]
async fn rotates_user_agent_per_request() {
let server = MockServer::start().await;
// The mock only matches requests carrying the pooled user agent.
Mock::given(method("GET"))
.and(path("/alice"))
.and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.rotate_user_agents(vec!["RotatorUA/9.9".into()])
.build()
.unwrap();
let outcome = client.check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
}
/// A proxy string that is not a URL makes `build` fail with `Error::HttpSetup`.
#[test]
fn invalid_proxy_url_fails_build() {
let err = Client::builder().proxy("not a url").build().unwrap_err();
assert!(matches!(err, Error::HttpSetup { .. }));
}
/// A proxy string without a supported scheme is rejected with the "must start with" message.
#[test]
fn schemeless_proxy_is_rejected_up_front() {
let err = Client::builder().proxy("not-a-url").build().unwrap_err();
let Error::HttpSetup { message } = err else {
panic!("expected HttpSetup, got {err:?}");
};
assert!(message.contains("must start with"), "{message}");
}
/// A `socks5://` proxy URL passes validation and the client builds.
#[test]
fn socks5_proxy_scheme_is_accepted() {
assert!(
Client::builder()
.proxy("socks5://127.0.0.1:9050")
.build()
.is_ok()
);
}
/// With `max_rps(2)` and no per-host interval, two back-to-back probes are spaced by at
/// least 350 ms.
#[tokio::test]
async fn global_rps_cap_spaces_requests_across_hosts() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
// Both sites point at the same mock server, under different path prefixes.
let site_a = Site {
name: "A".into(),
url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
signals: vec![Signal::StatusFound { codes: vec![200] }],
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
};
let site_b = Site {
name: "B".into(),
url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
signals: vec![Signal::StatusFound { codes: vec![200] }],
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
};
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.max_rps(std::num::NonZeroU32::new(2).unwrap())
.build()
.unwrap();
client.check(&site_a, &user()).await;
let started = Instant::now();
client.check(&site_b, &user()).await;
// Two requests per second means a 500 ms interval; the bound leaves some slack.
assert!(
started.elapsed() >= Duration::from_millis(350),
"global cap should space cross-host requests, got {:?}",
started.elapsed(),
);
}
/// With `respect_robots(true)`, a path disallowed by robots.txt yields `RobotsDisallowed`
/// while an allowed path is probed normally.
#[tokio::test]
async fn respect_robots_skips_disallowed_paths() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/robots.txt"))
.respond_with(
ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
)
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/no/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/yes/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.respect_robots(true)
.build()
.unwrap();
let disallowed = Site {
name: "No".into(),
url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
signals: vec![Signal::StatusFound { codes: vec![200] }],
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
};
let allowed = Site {
name: "Yes".into(),
url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
signals: vec![Signal::StatusFound { codes: vec![200] }],
known_present: None,
known_absent: None,
extract: Vec::new(),
tags: Vec::new(),
request_headers: std::collections::BTreeMap::new(),
regex_check: None,
engine: None,
};
let no = client.check(&disallowed, &user()).await;
assert_eq!(no.kind, MatchKind::Uncertain);
assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
let yes = client.check(&allowed, &user()).await;
assert_eq!(yes.kind, MatchKind::Found);
}
/// With only a status signal configured, body text that looks like "not found" does not
/// affect the verdict.
#[tokio::test]
async fn body_read_skipped_when_no_body_signal_needed() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
.mount(&server)
.await;
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let outcome = build_client().check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
}
/// Fake browser backend that always returns a canned page and counts its invocations.
#[derive(Debug)]
struct RecordingBackend {
/// Page cloned and returned on every fetch.
page: RenderedPage,
/// Number of times `fetch` has been called.
calls: std::sync::atomic::AtomicUsize,
}
impl RecordingBackend {
/// Creates a backend that serves `page`, with the call counter at zero.
fn with_page(page: RenderedPage) -> Self {
Self {
page,
calls: std::sync::atomic::AtomicUsize::new(0),
}
}
/// Returns how many times `fetch` has been called so far.
fn call_count(&self) -> usize {
self.calls.load(std::sync::atomic::Ordering::SeqCst)
}
}
#[async_trait::async_trait]
impl BrowserBackend for RecordingBackend {
/// Ignores all arguments, bumps the call counter and returns a clone of the canned page.
async fn fetch(
&self,
_url: &url::Url,
_headers: &std::collections::BTreeMap<String, String>,
_timeout: Duration,
) -> Result<RenderedPage> {
self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
Ok(self.page.clone())
}
}
/// Like `site_with` with a single `StatusFound` (200) signal, plus the `bot-protected` tag.
fn site_bot_protected(server: &MockServer) -> Site {
let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
s.tags = vec!["bot-protected".into()];
s
}
/// A `bot-protected` site is probed through the browser backend, exactly once.
#[tokio::test]
async fn browser_routes_bot_protected_sites() {
// No mock is mounted on the server; the verdict comes from the backend's canned page.
let server = MockServer::start().await;
let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
status: 200,
final_url: url::Url::parse("https://example.com/alice").unwrap(),
body: "<html></html>".into(),
elapsed_ms: 42,
}));
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.browser(backend.clone())
.build()
.unwrap();
let outcome = client.check(&site_bot_protected(&server), &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
}
/// A site without the `bot-protected` tag is probed over raw HTTP; the backend is not called.
#[tokio::test]
async fn non_bot_protected_sites_skip_browser() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/alice"))
.respond_with(ResponseTemplate::new(200))
.mount(&server)
.await;
// The canned page reports 500, so a `Found` verdict can only come from the mock server.
let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
status: 500, final_url: url::Url::parse("https://x/").unwrap(),
body: String::new(),
elapsed_ms: 0,
}));
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.browser(backend.clone())
.build()
.unwrap();
let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
let outcome = client.check(&site, &user()).await;
assert_eq!(outcome.kind, MatchKind::Found);
assert_eq!(backend.call_count(), 0, "browser must not be touched");
}
/// With a browser budget of 1, the second browser-routed check yields `BrowserBudget`
/// without invoking the backend again.
#[tokio::test]
async fn browser_budget_exhaust_yields_uncertain() {
let server = MockServer::start().await;
let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
status: 200,
final_url: url::Url::parse("https://x/").unwrap(),
body: String::new(),
elapsed_ms: 0,
}));
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.browser(backend.clone())
.browser_budget(1)
.build()
.unwrap();
let site = site_bot_protected(&server);
let first = client.check(&site, &user()).await;
assert_eq!(first.kind, MatchKind::Found);
let second = client.check(&site, &user()).await;
assert_eq!(second.kind, MatchKind::Uncertain);
assert!(matches!(
second.reason,
Some(UncertainReason::BrowserBudget)
));
assert_eq!(
backend.call_count(),
1,
"second call must not invoke backend"
);
}
/// A backend error surfaces as `Uncertain` with `BrowserFailed` carrying the error text.
#[tokio::test]
async fn browser_failure_surfaces_as_uncertain_browser_failed() {
// Backend whose fetch always fails with a `BrowserSetup` error.
struct FailingBackend;
#[async_trait::async_trait]
impl BrowserBackend for FailingBackend {
async fn fetch(
&self,
_url: &url::Url,
_headers: &std::collections::BTreeMap<String, String>,
_timeout: Duration,
) -> Result<RenderedPage> {
Err(Error::BrowserSetup {
message: "simulated crash".into(),
})
}
}
impl std::fmt::Debug for FailingBackend {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str("FailingBackend")
}
}
let server = MockServer::start().await;
let client = Client::builder()
.min_request_interval(Duration::ZERO)
.max_retries(0)
.browser(Arc::new(FailingBackend))
.build()
.unwrap();
let outcome = client.check(&site_bot_protected(&server), &user()).await;
assert_eq!(outcome.kind, MatchKind::Uncertain);
match outcome.reason {
Some(UncertainReason::BrowserFailed(msg)) => {
assert!(msg.contains("simulated crash"), "got: {msg}");
}
other => panic!("expected BrowserFailed, got {other:?}"),
}
}
}