adler_core/client/probe.rs
1//! Per-site probe path: routing, ban-retry, escalation, finish.
2//!
3//! Hosts the methods on [`Client`] that turn one `(site, username)`
4//! pair into a [`CheckOutcome`]: the public entry point
5//! [`Client::check`], the request-issuing path [`Client::probe_once`]
6//! (browser routing → impersonate-fingerprint → egress selection →
7//! HTTP fetch), the HTTP→browser escalation in [`Client::maybe_escalate`]
8//! when a cheap-path response merits a second look, and the
9//! signal-evaluation [`Client::finish`] that turns a raw response
10//! into a final outcome. Also hosts the two diagnostic fetch helpers
11//! (`fetch`, `fetch_for_doctor`) used by `adler --doctor --fix`.
12//!
13//! Construction lives in `client::builder`; accessors and
14//! [`Client::with_egress_subset`] stay in `client::mod`.
15
16use std::borrow::Cow;
17use std::collections::BTreeMap;
18use std::sync::Arc;
19use std::time::Instant;
20
21use crate::access::EgressChoice;
22use crate::check::{CheckOutcome, MatchKind, UncertainReason};
23use crate::retry;
24use crate::site::{HttpMethod, Probe, ProtectionKind, Signal, SignalVerdict, Site, aggregate};
25use crate::transport::{
26 BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
27};
28use crate::username::Username;
29
30use super::util::{host_of, origin_and_path, outcome, uncertain};
31use super::{BOT_PROTECTED_TAG, Client, GLOBAL_THROTTLE_KEY, RawResponse};
32
33fn routes_through_browser(site: &Site) -> bool {
34 site.tags
35 .iter()
36 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
37 || site
38 .protection
39 .iter()
40 .any(|p| !matches!(p, ProtectionKind::UserAuth))
41}
42
43impl Client {
44 /// Probe a single site for `username`, retrying on transient bans.
45 ///
46 /// Network failures, timeouts, and unexpected response shapes all yield
47 /// [`MatchKind::Uncertain`] with a descriptive note. The method never
48 /// returns an error: at the executor level we want a partial result for
49 /// every site, not abort-on-first-failure semantics.
50 ///
51 /// When ban detection classifies a response as `rate_limited` /
52 /// `cloudflare_challenge`, the call is retried with jittered exponential
53 /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
54 /// Uncertain (network errors, body read failures) is **not** retried —
55 /// those failures rarely fix themselves in the seconds-to-minutes window
56 /// we'd block for.
57 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
58 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
59 let mut attempt: u32 = 0;
60 loop {
61 let outcome = self.probe_once(site, username).await;
62 if !retry::should_retry(&outcome, attempt, &self.retry) {
63 return outcome;
64 }
65 let delay = retry::backoff_delay(attempt, &self.retry);
66 tracing::info!(
67 site = %site.name,
68 attempt = attempt + 1,
69 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
70 ?delay,
71 "transient ban, retrying",
72 );
73 tokio::time::sleep(delay).await;
74 attempt += 1;
75 }
76 }
77
78 /// Fetch a URL and return raw response data (status, final URL, body)
79 /// with the same throttle / User-Agent / proxy machinery as `check`,
80 /// but without signal evaluation or retry.
81 ///
82 /// Returns `None` on any network/transport error. Intended for
83 /// diagnostics such as `adler --doctor --fix`, which diffs the
84 /// responses for a known-present and a nonsense user to derive a
85 /// signature.
86 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
87 let host = host_of(url);
88 if let Some(global) = &self.global_throttle {
89 global.wait(GLOBAL_THROTTLE_KEY).await;
90 }
91 self.throttle.wait(&host).await;
92 let mut request = self.http.client().get(url);
93 if let Some(ua) = self.pick_user_agent() {
94 request = request.header(reqwest::header::USER_AGENT, ua);
95 }
96 let response = request.send().await.ok()?;
97 let status = response.status().as_u16();
98 let final_url = response.url().to_string();
99 let body = response.text().await.unwrap_or_default();
100 Some(RawResponse {
101 status,
102 final_url,
103 body,
104 })
105 }
106
107 /// Same as [`Self::fetch`] but routes through the configured browser
108 /// backend when the site is tagged `bot-protected` and a backend is
109 /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
110 /// so that the diff-derivation works against the JS-rendered page
111 /// (login wall vs. real profile) rather than two identical raw-HTTP
112 /// shells.
113 ///
114 /// Falls back to raw HTTP if (a) no browser is configured, (b) the
115 /// site isn't `bot-protected`, or (c) the browser fetch fails — so
116 /// callers get the same `Option<RawResponse>` shape either way.
117 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
118 if let Some(backend) = self.browser.as_deref() {
119 if routes_through_browser(site) {
120 let parsed = url::Url::parse(url).ok()?;
121 match backend
122 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
123 .await
124 {
125 Ok(page) => {
126 return Some(RawResponse {
127 status: page.status,
128 final_url: page.final_url.to_string(),
129 body: page.body,
130 });
131 }
132 Err(err) => {
133 tracing::warn!(
134 site = %site.name, %url, error = %err,
135 "browser fetch failed in doctor; falling back to raw HTTP",
136 );
137 }
138 }
139 }
140 }
141 self.fetch(url).await
142 }
143
144 /// Pick a User-Agent for the next request from the rotation pool, or
145 /// `None` to fall back on the client's fixed header.
146 fn pick_user_agent(&self) -> Option<&str> {
147 match self.user_agents.len() {
148 0 => None,
149 1 => Some(&self.user_agents[0]),
150 n => Some(&self.user_agents[fastrand::usize(0..n)]),
151 }
152 }
153
154 // Splitting probe_once into helpers would scatter the request/response
155 // flow that has to read top-to-bottom; one long function reads better.
156 #[allow(clippy::too_many_lines)]
157 async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
158 let url = site.url_for(username);
159
160 // Site-level username constraint (Sherlock's `regexCheck`).
161 // Mismatch → skip the probe entirely. Saves a request and
162 // sidesteps the false-positive class where a site 404s on
163 // illegal usernames in a way our signal can't distinguish
164 // from a missing account. If the pattern fails to compile
165 // (Sherlock occasionally uses lookarounds, which our `regex`
166 // crate can't express), we let validate's warn-log stand
167 // and silently fall through — the rest of the probe still
168 // works.
169 if let Some(pat) = &site.regex_check {
170 if let Ok(re) = regex::Regex::new(pat) {
171 if !re.is_match(username.as_str()) {
172 return uncertain(
173 &site.name,
174 url,
175 Instant::now(),
176 UncertainReason::UsernameNotAllowed,
177 );
178 }
179 }
180 }
181
182 // Resolve an operator session if the site's access policy names
183 // one, and fold its headers (cookies / tokens) over the site's
184 // own. A named-but-missing session is reported rather than sent
185 // unauthenticated into a login wall — which reads identically
186 // for an existing and a missing account. Applies to both the
187 // HTTP and browser transports.
188 let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
189 None => Cow::Borrowed(&site.request_headers),
190 Some(name) => match self.sessions.get(name) {
191 Some(session) => Cow::Owned(session.apply(&site.request_headers)),
192 None => {
193 return uncertain(
194 &site.name,
195 url,
196 Instant::now(),
197 UncertainReason::SessionRequired,
198 );
199 }
200 },
201 };
202 let headers: &BTreeMap<String, String> = &session_headers;
203
204 // Auto-route bot-protected sites through the browser backend when
205 // one is configured. Raw HTTP can't see past their JS/login wall,
206 // so this is the only way they ever produce a Found verdict.
207 // A site is "bot-protected" in the routing sense if it carries
208 // the legacy tag OR declares any specific protection mechanism
209 // via the new `protection` field — either signal is enough.
210 if let Some(backend) = &self.browser {
211 if routes_through_browser(site) {
212 if self.browser_budget.try_consume() {
213 let started = Instant::now();
214 let req = FetchRequest {
215 method: site.request_method,
216 url: &url,
217 body: None,
218 user_agent: None,
219 headers,
220 want_body: true,
221 };
222 let fetcher = BrowserFetcher::new(Arc::clone(backend));
223 let mut outcome = match fetcher.fetch(&req).await {
224 Ok(resp) => self.finish(site, url, started, &resp),
225 Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
226 };
227 outcome.transport = Some(crate::escalation::TransportTier::Browser);
228 return outcome;
229 }
230 tracing::warn!(site = %site.name, "browser budget exhausted");
231 let mut outcome = uncertain(
232 &site.name,
233 url,
234 Instant::now(),
235 UncertainReason::BrowserBudget,
236 );
237 outcome.transport = Some(crate::escalation::TransportTier::Browser);
238 return outcome;
239 }
240 }
241
242 // Phase 2: route pure-`TlsFingerprint` sites through the
243 // impersonating transport — a real BoringSSL TLS handshake from
244 // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
245 // protection tag, at a fraction of the cost of a real browser.
246 // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
247 // keep going through the browser path above, where they were.
248 #[cfg(feature = "impersonate")]
249 if let Some(fetcher) = &self.impersonate {
250 let pure_tls = site.protection.len() == 1
251 && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
252 && !site
253 .tags
254 .iter()
255 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
256 if pure_tls {
257 let started = Instant::now();
258 let req = FetchRequest {
259 method: site.request_method,
260 url: &url,
261 body: None,
262 user_agent: self.pick_user_agent(),
263 headers,
264 want_body: true,
265 };
266 let mut primary = match fetcher.fetch(&req).await {
267 Ok(resp) => self.finish(site, url.clone(), started, &resp),
268 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
269 };
270 primary.transport = Some(crate::escalation::TransportTier::Impersonate);
271 return self.maybe_escalate(site, &url, headers, primary).await;
272 }
273 }
274
275 // Egress selection: route the HTTP path through a geo / IP-type
276 // matching proxy when the site's access policy demands one. An
277 // unconstrained policy uses the default egress; a constrained
278 // policy with no matching egress is reported `GeoUnavailable`
279 // rather than fetched from the wrong location (a false
280 // `NotFound` would be worse than an honest `Uncertain`).
281 let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
282 EgressChoice::Default => Arc::clone(&self.http),
283 EgressChoice::Use(fetcher) => fetcher,
284 EgressChoice::Unavailable => {
285 return uncertain(
286 &site.name,
287 url,
288 Instant::now(),
289 UncertainReason::GeoUnavailable,
290 );
291 }
292 };
293
294 let host = host_of(&url);
295
296 // robots.txt gate, before consuming a throttle slot or probing.
297 if let Some(robots) = &self.robots {
298 if let Some((origin, path)) = origin_and_path(&url) {
299 if !robots.allowed(&origin, &path).await {
300 tracing::debug!(%url, "skipped by robots.txt");
301 return uncertain(
302 &site.name,
303 url,
304 Instant::now(),
305 UncertainReason::RobotsDisallowed,
306 );
307 }
308 }
309 }
310
311 // Global cap first (gates every request), then per-host spacing.
312 if let Some(global) = &self.global_throttle {
313 global.wait(GLOBAL_THROTTLE_KEY).await;
314 }
315 self.throttle.wait(&host).await;
316 let started = Instant::now();
317 tracing::debug!(%url, %host, "probing");
318
319 // Read the body only if a signal needs it, or enrichment is on
320 // and the site declares extractor rules (extraction needs it).
321 let want_enrich = self.enrich && !site.extract.is_empty();
322 let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
323
324 // POST sites carry their own body payload (the username goes in
325 // the body, not the URL — e.g. Anilist's GraphQL endpoint).
326 // `{username}` in `Site::request_body` is substituted here,
327 // mirroring URL substitution.
328 let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
329 const USERNAME_PH: &str = "{username}";
330 site.request_body
331 .as_deref()
332 .map(|t| t.replace(USERNAME_PH, username.as_str()))
333 } else {
334 None
335 };
336
337 let req = FetchRequest {
338 method: site.request_method,
339 url: &url,
340 body: body_for_post.as_deref(),
341 user_agent: self.pick_user_agent(),
342 headers,
343 want_body: needs_body,
344 };
345 let mut primary = match egress.fetch(&req).await {
346 Ok(resp) => self.finish(site, url.clone(), started, &resp),
347 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
348 };
349 primary.transport = Some(crate::escalation::TransportTier::Http);
350 self.maybe_escalate(site, &url, headers, primary).await
351 }
352
353 /// If the cheap transport returned an `Uncertain` reason a browser
354 /// fetch could plausibly resolve, retry through the browser backend
355 /// and stamp the new outcome as escalated. Bounded by
356 /// [`escalation_budget`](ClientBuilder::escalation_budget).
357 async fn maybe_escalate(
358 &self,
359 site: &Site,
360 url: &str,
361 headers: &BTreeMap<String, String>,
362 primary: CheckOutcome,
363 ) -> CheckOutcome {
364 if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
365 return primary;
366 }
367 let Some(reason) = &primary.reason else {
368 return primary;
369 };
370 if !crate::escalation::should_escalate(reason) {
371 return primary;
372 }
373 let Some(backend) = &self.browser else {
374 return primary;
375 };
376 if !self.escalation_budget.try_consume() {
377 tracing::debug!(site = %site.name, "escalation budget exhausted");
378 return primary;
379 }
380
381 tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
382 let started = Instant::now();
383 let req = FetchRequest {
384 method: site.request_method,
385 url,
386 body: None,
387 user_agent: None,
388 headers,
389 want_body: true,
390 };
391 let fetcher = BrowserFetcher::new(Arc::clone(backend));
392 let mut escalated = match fetcher.fetch(&req).await {
393 Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
394 Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
395 };
396 escalated.transport = Some(crate::escalation::TransportTier::Browser);
397 escalated.escalations = 1;
398 escalated
399 }
400
401 /// Evaluate a fetched response against the site's signals and build
402 /// the outcome. Shared by the HTTP and browser transports so the
403 /// verdict / evidence / enrichment logic lives in exactly one place.
404 fn finish(
405 &self,
406 site: &Site,
407 url: String,
408 started: Instant,
409 resp: &crate::transport::FetchResponse,
410 ) -> CheckOutcome {
411 let probe = Probe {
412 status: resp.status,
413 final_url: &resp.final_url,
414 body: &resp.body,
415 };
416 let votes: Vec<(&Signal, SignalVerdict)> = site
417 .signals
418 .iter()
419 .map(|s| (s, s.evaluate(&probe)))
420 .collect();
421 let kind = aggregate(votes.iter().map(|(_, v)| *v));
422 let mut result = outcome(&site.name, url, started, kind);
423 // Record which signals produced the verdict (the winning polarity).
424 let winning = match kind {
425 MatchKind::Found => Some(SignalVerdict::Found),
426 MatchKind::NotFound => Some(SignalVerdict::NotFound),
427 MatchKind::Uncertain => None,
428 };
429 if let Some(want) = winning {
430 result.evidence = votes
431 .iter()
432 .filter(|(_, v)| *v == want)
433 .map(|(s, _)| s.describe_match(&probe))
434 .collect();
435 }
436 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
437 result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
438 }
439 result
440 }
441}