adler_core/client/probe.rs
1//! Per-site probe path: routing, ban-retry, escalation, finish.
2//!
3//! Hosts the methods on [`Client`] that turn one `(site, username)`
4//! pair into a [`CheckOutcome`]: the public entry point
5//! [`Client::check`], the request-issuing path [`Client::probe_once`]
6//! (browser routing → impersonate-fingerprint → egress selection →
7//! HTTP fetch), the HTTP→browser escalation in [`Client::maybe_escalate`]
8//! when a cheap-path response merits a second look, and the
9//! signal-evaluation [`Client::finish`] that turns a raw response
10//! into a final outcome. Also hosts the two diagnostic fetch helpers
11//! (`fetch`, `fetch_for_doctor`) used by `adler --doctor --fix`.
12//!
13//! Construction lives in `client::builder`; accessors and
14//! [`Client::with_egress_subset`] stay in `client::mod`.
15
16use std::borrow::Cow;
17use std::collections::BTreeMap;
18use std::sync::Arc;
19use std::time::Instant;
20
21use crate::access::EgressChoice;
22use crate::check::{CheckOutcome, MatchKind, UncertainReason};
23use crate::retry;
24use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
25use crate::transport::{
26 BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
27};
28use crate::username::Username;
29
30use super::util::{host_of, origin_and_path, outcome, uncertain};
31use super::{BOT_PROTECTED_TAG, Client, GLOBAL_THROTTLE_KEY, RawResponse};
32
33impl Client {
34 /// Probe a single site for `username`, retrying on transient bans.
35 ///
36 /// Network failures, timeouts, and unexpected response shapes all yield
37 /// [`MatchKind::Uncertain`] with a descriptive note. The method never
38 /// returns an error: at the executor level we want a partial result for
39 /// every site, not abort-on-first-failure semantics.
40 ///
41 /// When ban detection classifies a response as `rate_limited` /
42 /// `cloudflare_challenge`, the call is retried with jittered exponential
43 /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
44 /// Uncertain (network errors, body read failures) is **not** retried —
45 /// those failures rarely fix themselves in the seconds-to-minutes window
46 /// we'd block for.
47 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
48 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
49 let mut attempt: u32 = 0;
50 loop {
51 let outcome = self.probe_once(site, username).await;
52 if !retry::should_retry(&outcome, attempt, &self.retry) {
53 return outcome;
54 }
55 let delay = retry::backoff_delay(attempt, &self.retry);
56 tracing::info!(
57 site = %site.name,
58 attempt = attempt + 1,
59 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
60 ?delay,
61 "transient ban, retrying",
62 );
63 tokio::time::sleep(delay).await;
64 attempt += 1;
65 }
66 }
67
68 /// Fetch a URL and return raw response data (status, final URL, body)
69 /// with the same throttle / User-Agent / proxy machinery as `check`,
70 /// but without signal evaluation or retry.
71 ///
72 /// Returns `None` on any network/transport error. Intended for
73 /// diagnostics such as `adler --doctor --fix`, which diffs the
74 /// responses for a known-present and a nonsense user to derive a
75 /// signature.
76 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
77 let host = host_of(url);
78 if let Some(global) = &self.global_throttle {
79 global.wait(GLOBAL_THROTTLE_KEY).await;
80 }
81 self.throttle.wait(&host).await;
82 let mut request = self.http.client().get(url);
83 if let Some(ua) = self.pick_user_agent() {
84 request = request.header(reqwest::header::USER_AGENT, ua);
85 }
86 let response = request.send().await.ok()?;
87 let status = response.status().as_u16();
88 let final_url = response.url().to_string();
89 let body = response.text().await.unwrap_or_default();
90 Some(RawResponse {
91 status,
92 final_url,
93 body,
94 })
95 }
96
97 /// Same as [`Self::fetch`] but routes through the configured browser
98 /// backend when the site is tagged `bot-protected` and a backend is
99 /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
100 /// so that the diff-derivation works against the JS-rendered page
101 /// (login wall vs. real profile) rather than two identical raw-HTTP
102 /// shells.
103 ///
104 /// Falls back to raw HTTP if (a) no browser is configured, (b) the
105 /// site isn't `bot-protected`, or (c) the browser fetch fails — so
106 /// callers get the same `Option<RawResponse>` shape either way.
107 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
108 if let Some(backend) = self.browser.as_deref() {
109 let has_tag = site
110 .tags
111 .iter()
112 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
113 if has_tag || !site.protection.is_empty() {
114 let parsed = url::Url::parse(url).ok()?;
115 match backend
116 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
117 .await
118 {
119 Ok(page) => {
120 return Some(RawResponse {
121 status: page.status,
122 final_url: page.final_url.to_string(),
123 body: page.body,
124 });
125 }
126 Err(err) => {
127 tracing::warn!(
128 site = %site.name, %url, error = %err,
129 "browser fetch failed in doctor; falling back to raw HTTP",
130 );
131 }
132 }
133 }
134 }
135 self.fetch(url).await
136 }
137
138 /// Pick a User-Agent for the next request from the rotation pool, or
139 /// `None` to fall back on the client's fixed header.
140 fn pick_user_agent(&self) -> Option<&str> {
141 match self.user_agents.len() {
142 0 => None,
143 1 => Some(&self.user_agents[0]),
144 n => Some(&self.user_agents[fastrand::usize(0..n)]),
145 }
146 }
147
148 // Splitting probe_once into helpers would scatter the request/response
149 // flow that has to read top-to-bottom; one long function reads better.
150 #[allow(clippy::too_many_lines)]
151 async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
152 let url = site.url_for(username);
153
154 // Site-level username constraint (Sherlock's `regexCheck`).
155 // Mismatch → skip the probe entirely. Saves a request and
156 // sidesteps the false-positive class where a site 404s on
157 // illegal usernames in a way our signal can't distinguish
158 // from a missing account. If the pattern fails to compile
159 // (Sherlock occasionally uses lookarounds, which our `regex`
160 // crate can't express), we let validate's warn-log stand
161 // and silently fall through — the rest of the probe still
162 // works.
163 if let Some(pat) = &site.regex_check {
164 if let Ok(re) = regex::Regex::new(pat) {
165 if !re.is_match(username.as_str()) {
166 return uncertain(
167 &site.name,
168 url,
169 Instant::now(),
170 UncertainReason::UsernameNotAllowed,
171 );
172 }
173 }
174 }
175
176 // Resolve an operator session if the site's access policy names
177 // one, and fold its headers (cookies / tokens) over the site's
178 // own. A named-but-missing session is reported rather than sent
179 // unauthenticated into a login wall — which reads identically
180 // for an existing and a missing account. Applies to both the
181 // HTTP and browser transports.
182 let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
183 None => Cow::Borrowed(&site.request_headers),
184 Some(name) => match self.sessions.get(name) {
185 Some(session) => Cow::Owned(session.apply(&site.request_headers)),
186 None => {
187 return uncertain(
188 &site.name,
189 url,
190 Instant::now(),
191 UncertainReason::SessionRequired,
192 );
193 }
194 },
195 };
196 let headers: &BTreeMap<String, String> = &session_headers;
197
198 // Auto-route bot-protected sites through the browser backend when
199 // one is configured. Raw HTTP can't see past their JS/login wall,
200 // so this is the only way they ever produce a Found verdict.
201 // A site is "bot-protected" in the routing sense if it carries
202 // the legacy tag OR declares any specific protection mechanism
203 // via the new `protection` field — either signal is enough.
204 if let Some(backend) = &self.browser {
205 let has_tag = site
206 .tags
207 .iter()
208 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
209 if has_tag || !site.protection.is_empty() {
210 if self.browser_budget.try_consume() {
211 let started = Instant::now();
212 let req = FetchRequest {
213 method: site.request_method,
214 url: &url,
215 body: None,
216 user_agent: None,
217 headers,
218 want_body: true,
219 };
220 let fetcher = BrowserFetcher::new(Arc::clone(backend));
221 let mut outcome = match fetcher.fetch(&req).await {
222 Ok(resp) => self.finish(site, url, started, &resp),
223 Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
224 };
225 outcome.transport = Some(crate::escalation::TransportTier::Browser);
226 return outcome;
227 }
228 tracing::warn!(site = %site.name, "browser budget exhausted");
229 let mut outcome = uncertain(
230 &site.name,
231 url,
232 Instant::now(),
233 UncertainReason::BrowserBudget,
234 );
235 outcome.transport = Some(crate::escalation::TransportTier::Browser);
236 return outcome;
237 }
238 }
239
240 // Phase 2: route pure-`TlsFingerprint` sites through the
241 // impersonating transport — a real BoringSSL TLS handshake from
242 // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
243 // protection tag, at a fraction of the cost of a real browser.
244 // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
245 // keep going through the browser path above, where they were.
246 #[cfg(feature = "impersonate")]
247 if let Some(fetcher) = &self.impersonate {
248 let pure_tls = site.protection.len() == 1
249 && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
250 && !site
251 .tags
252 .iter()
253 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
254 if pure_tls {
255 let started = Instant::now();
256 let req = FetchRequest {
257 method: site.request_method,
258 url: &url,
259 body: None,
260 user_agent: self.pick_user_agent(),
261 headers,
262 want_body: true,
263 };
264 let mut primary = match fetcher.fetch(&req).await {
265 Ok(resp) => self.finish(site, url.clone(), started, &resp),
266 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
267 };
268 primary.transport = Some(crate::escalation::TransportTier::Impersonate);
269 return self.maybe_escalate(site, &url, headers, primary).await;
270 }
271 }
272
273 // Egress selection: route the HTTP path through a geo / IP-type
274 // matching proxy when the site's access policy demands one. An
275 // unconstrained policy uses the default egress; a constrained
276 // policy with no matching egress is reported `GeoUnavailable`
277 // rather than fetched from the wrong location (a false
278 // `NotFound` would be worse than an honest `Uncertain`).
279 let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
280 EgressChoice::Default => Arc::clone(&self.http),
281 EgressChoice::Use(fetcher) => fetcher,
282 EgressChoice::Unavailable => {
283 return uncertain(
284 &site.name,
285 url,
286 Instant::now(),
287 UncertainReason::GeoUnavailable,
288 );
289 }
290 };
291
292 let host = host_of(&url);
293
294 // robots.txt gate, before consuming a throttle slot or probing.
295 if let Some(robots) = &self.robots {
296 if let Some((origin, path)) = origin_and_path(&url) {
297 if !robots.allowed(&origin, &path).await {
298 tracing::debug!(%url, "skipped by robots.txt");
299 return uncertain(
300 &site.name,
301 url,
302 Instant::now(),
303 UncertainReason::RobotsDisallowed,
304 );
305 }
306 }
307 }
308
309 // Global cap first (gates every request), then per-host spacing.
310 if let Some(global) = &self.global_throttle {
311 global.wait(GLOBAL_THROTTLE_KEY).await;
312 }
313 self.throttle.wait(&host).await;
314 let started = Instant::now();
315 tracing::debug!(%url, %host, "probing");
316
317 // Read the body only if a signal needs it, or enrichment is on
318 // and the site declares extractor rules (extraction needs it).
319 let want_enrich = self.enrich && !site.extract.is_empty();
320 let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
321
322 // POST sites carry their own body payload (the username goes in
323 // the body, not the URL — e.g. Anilist's GraphQL endpoint).
324 // `{username}` in `Site::request_body` is substituted here,
325 // mirroring URL substitution.
326 let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
327 const USERNAME_PH: &str = "{username}";
328 site.request_body
329 .as_deref()
330 .map(|t| t.replace(USERNAME_PH, username.as_str()))
331 } else {
332 None
333 };
334
335 let req = FetchRequest {
336 method: site.request_method,
337 url: &url,
338 body: body_for_post.as_deref(),
339 user_agent: self.pick_user_agent(),
340 headers,
341 want_body: needs_body,
342 };
343 let mut primary = match egress.fetch(&req).await {
344 Ok(resp) => self.finish(site, url.clone(), started, &resp),
345 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
346 };
347 primary.transport = Some(crate::escalation::TransportTier::Http);
348 self.maybe_escalate(site, &url, headers, primary).await
349 }
350
351 /// If the cheap transport returned an `Uncertain` reason a browser
352 /// fetch could plausibly resolve, retry through the browser backend
353 /// and stamp the new outcome as escalated. Bounded by
354 /// [`escalation_budget`](ClientBuilder::escalation_budget).
355 async fn maybe_escalate(
356 &self,
357 site: &Site,
358 url: &str,
359 headers: &BTreeMap<String, String>,
360 primary: CheckOutcome,
361 ) -> CheckOutcome {
362 if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
363 return primary;
364 }
365 let Some(reason) = &primary.reason else {
366 return primary;
367 };
368 if !crate::escalation::should_escalate(reason) {
369 return primary;
370 }
371 let Some(backend) = &self.browser else {
372 return primary;
373 };
374 if !self.escalation_budget.try_consume() {
375 tracing::debug!(site = %site.name, "escalation budget exhausted");
376 return primary;
377 }
378
379 tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
380 let started = Instant::now();
381 let req = FetchRequest {
382 method: site.request_method,
383 url,
384 body: None,
385 user_agent: None,
386 headers,
387 want_body: true,
388 };
389 let fetcher = BrowserFetcher::new(Arc::clone(backend));
390 let mut escalated = match fetcher.fetch(&req).await {
391 Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
392 Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
393 };
394 escalated.transport = Some(crate::escalation::TransportTier::Browser);
395 escalated.escalations = 1;
396 escalated
397 }
398
399 /// Evaluate a fetched response against the site's signals and build
400 /// the outcome. Shared by the HTTP and browser transports so the
401 /// verdict / evidence / enrichment logic lives in exactly one place.
402 fn finish(
403 &self,
404 site: &Site,
405 url: String,
406 started: Instant,
407 resp: &crate::transport::FetchResponse,
408 ) -> CheckOutcome {
409 let probe = Probe {
410 status: resp.status,
411 final_url: &resp.final_url,
412 body: &resp.body,
413 };
414 let votes: Vec<(&Signal, SignalVerdict)> = site
415 .signals
416 .iter()
417 .map(|s| (s, s.evaluate(&probe)))
418 .collect();
419 let kind = aggregate(votes.iter().map(|(_, v)| *v));
420 let mut result = outcome(&site.name, url, started, kind);
421 // Record which signals produced the verdict (the winning polarity).
422 let winning = match kind {
423 MatchKind::Found => Some(SignalVerdict::Found),
424 MatchKind::NotFound => Some(SignalVerdict::NotFound),
425 MatchKind::Uncertain => None,
426 };
427 if let Some(want) = winning {
428 result.evidence = votes
429 .iter()
430 .filter(|(_, v)| *v == want)
431 .map(|(s, _)| s.describe_match(&probe))
432 .collect();
433 }
434 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
435 result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
436 }
437 result
438 }
439}