Skip to main content

adler_core/client/
probe.rs

1//! Per-site probe path: routing, ban-retry, escalation, finish.
2//!
3//! Hosts the methods on [`Client`] that turn one `(site, username)`
4//! pair into a [`CheckOutcome`]: the public entry point
5//! [`Client::check`], the request-issuing path [`Client::probe_once`]
6//! (browser routing → impersonate-fingerprint → egress selection →
7//! HTTP fetch), the HTTP→browser escalation in [`Client::maybe_escalate`]
8//! when a cheap-path response merits a second look, and the
9//! signal-evaluation [`Client::finish`] that turns a raw response
10//! into a final outcome. Also hosts the two diagnostic fetch helpers
11//! (`fetch`, `fetch_for_doctor`) used by `adler --doctor --fix`.
12//!
13//! Construction lives in `client::builder`; accessors and
14//! [`Client::with_egress_subset`] stay in `client::mod`.
15
16use std::borrow::Cow;
17use std::collections::BTreeMap;
18use std::sync::Arc;
19use std::time::Instant;
20
21use crate::access::EgressChoice;
22use crate::check::{CheckOutcome, MatchKind, UncertainReason};
23use crate::retry;
24use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
25use crate::transport::{
26    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
27};
28use crate::username::Username;
29
30use super::util::{host_of, origin_and_path, outcome, uncertain};
31use super::{BOT_PROTECTED_TAG, Client, GLOBAL_THROTTLE_KEY, RawResponse};
32
33impl Client {
34    /// Probe a single site for `username`, retrying on transient bans.
35    ///
36    /// Network failures, timeouts, and unexpected response shapes all yield
37    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
38    /// returns an error: at the executor level we want a partial result for
39    /// every site, not abort-on-first-failure semantics.
40    ///
41    /// When ban detection classifies a response as `rate_limited` /
42    /// `cloudflare_challenge`, the call is retried with jittered exponential
43    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
44    /// Uncertain (network errors, body read failures) is **not** retried —
45    /// those failures rarely fix themselves in the seconds-to-minutes window
46    /// we'd block for.
47    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
48    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
49        let mut attempt: u32 = 0;
50        loop {
51            let outcome = self.probe_once(site, username).await;
52            if !retry::should_retry(&outcome, attempt, &self.retry) {
53                return outcome;
54            }
55            let delay = retry::backoff_delay(attempt, &self.retry);
56            tracing::info!(
57                site = %site.name,
58                attempt = attempt + 1,
59                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
60                ?delay,
61                "transient ban, retrying",
62            );
63            tokio::time::sleep(delay).await;
64            attempt += 1;
65        }
66    }
67
68    /// Fetch a URL and return raw response data (status, final URL, body)
69    /// with the same throttle / User-Agent / proxy machinery as `check`,
70    /// but without signal evaluation or retry.
71    ///
72    /// Returns `None` on any network/transport error. Intended for
73    /// diagnostics such as `adler --doctor --fix`, which diffs the
74    /// responses for a known-present and a nonsense user to derive a
75    /// signature.
76    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
77        let host = host_of(url);
78        if let Some(global) = &self.global_throttle {
79            global.wait(GLOBAL_THROTTLE_KEY).await;
80        }
81        self.throttle.wait(&host).await;
82        let mut request = self.http.client().get(url);
83        if let Some(ua) = self.pick_user_agent() {
84            request = request.header(reqwest::header::USER_AGENT, ua);
85        }
86        let response = request.send().await.ok()?;
87        let status = response.status().as_u16();
88        let final_url = response.url().to_string();
89        let body = response.text().await.unwrap_or_default();
90        Some(RawResponse {
91            status,
92            final_url,
93            body,
94        })
95    }
96
97    /// Same as [`Self::fetch`] but routes through the configured browser
98    /// backend when the site is tagged `bot-protected` and a backend is
99    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
100    /// so that the diff-derivation works against the JS-rendered page
101    /// (login wall vs. real profile) rather than two identical raw-HTTP
102    /// shells.
103    ///
104    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
105    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
106    /// callers get the same `Option<RawResponse>` shape either way.
107    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
108        if let Some(backend) = self.browser.as_deref() {
109            let has_tag = site
110                .tags
111                .iter()
112                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
113            if has_tag || !site.protection.is_empty() {
114                let parsed = url::Url::parse(url).ok()?;
115                match backend
116                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
117                    .await
118                {
119                    Ok(page) => {
120                        return Some(RawResponse {
121                            status: page.status,
122                            final_url: page.final_url.to_string(),
123                            body: page.body,
124                        });
125                    }
126                    Err(err) => {
127                        tracing::warn!(
128                            site = %site.name, %url, error = %err,
129                            "browser fetch failed in doctor; falling back to raw HTTP",
130                        );
131                    }
132                }
133            }
134        }
135        self.fetch(url).await
136    }
137
138    /// Pick a User-Agent for the next request from the rotation pool, or
139    /// `None` to fall back on the client's fixed header.
140    fn pick_user_agent(&self) -> Option<&str> {
141        match self.user_agents.len() {
142            0 => None,
143            1 => Some(&self.user_agents[0]),
144            n => Some(&self.user_agents[fastrand::usize(0..n)]),
145        }
146    }
147
148    // Splitting probe_once into helpers would scatter the request/response
149    // flow that has to read top-to-bottom; one long function reads better.
150    #[allow(clippy::too_many_lines)]
151    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
152        let url = site.url_for(username);
153
154        // Site-level username constraint (Sherlock's `regexCheck`).
155        // Mismatch → skip the probe entirely. Saves a request and
156        // sidesteps the false-positive class where a site 404s on
157        // illegal usernames in a way our signal can't distinguish
158        // from a missing account. If the pattern fails to compile
159        // (Sherlock occasionally uses lookarounds, which our `regex`
160        // crate can't express), we let validate's warn-log stand
161        // and silently fall through — the rest of the probe still
162        // works.
163        if let Some(pat) = &site.regex_check {
164            if let Ok(re) = regex::Regex::new(pat) {
165                if !re.is_match(username.as_str()) {
166                    return uncertain(
167                        &site.name,
168                        url,
169                        Instant::now(),
170                        UncertainReason::UsernameNotAllowed,
171                    );
172                }
173            }
174        }
175
176        // Resolve an operator session if the site's access policy names
177        // one, and fold its headers (cookies / tokens) over the site's
178        // own. A named-but-missing session is reported rather than sent
179        // unauthenticated into a login wall — which reads identically
180        // for an existing and a missing account. Applies to both the
181        // HTTP and browser transports.
182        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
183            None => Cow::Borrowed(&site.request_headers),
184            Some(name) => match self.sessions.get(name) {
185                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
186                None => {
187                    return uncertain(
188                        &site.name,
189                        url,
190                        Instant::now(),
191                        UncertainReason::SessionRequired,
192                    );
193                }
194            },
195        };
196        let headers: &BTreeMap<String, String> = &session_headers;
197
198        // Auto-route bot-protected sites through the browser backend when
199        // one is configured. Raw HTTP can't see past their JS/login wall,
200        // so this is the only way they ever produce a Found verdict.
201        // A site is "bot-protected" in the routing sense if it carries
202        // the legacy tag OR declares any specific protection mechanism
203        // via the new `protection` field — either signal is enough.
204        if let Some(backend) = &self.browser {
205            let has_tag = site
206                .tags
207                .iter()
208                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
209            if has_tag || !site.protection.is_empty() {
210                if self.browser_budget.try_consume() {
211                    let started = Instant::now();
212                    let req = FetchRequest {
213                        method: site.request_method,
214                        url: &url,
215                        body: None,
216                        user_agent: None,
217                        headers,
218                        want_body: true,
219                    };
220                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
221                    let mut outcome = match fetcher.fetch(&req).await {
222                        Ok(resp) => self.finish(site, url, started, &resp),
223                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
224                    };
225                    outcome.transport = Some(crate::escalation::TransportTier::Browser);
226                    return outcome;
227                }
228                tracing::warn!(site = %site.name, "browser budget exhausted");
229                let mut outcome = uncertain(
230                    &site.name,
231                    url,
232                    Instant::now(),
233                    UncertainReason::BrowserBudget,
234                );
235                outcome.transport = Some(crate::escalation::TransportTier::Browser);
236                return outcome;
237            }
238        }
239
240        // Phase 2: route pure-`TlsFingerprint` sites through the
241        // impersonating transport — a real BoringSSL TLS handshake from
242        // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
243        // protection tag, at a fraction of the cost of a real browser.
244        // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
245        // keep going through the browser path above, where they were.
246        #[cfg(feature = "impersonate")]
247        if let Some(fetcher) = &self.impersonate {
248            let pure_tls = site.protection.len() == 1
249                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
250                && !site
251                    .tags
252                    .iter()
253                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
254            if pure_tls {
255                let started = Instant::now();
256                let req = FetchRequest {
257                    method: site.request_method,
258                    url: &url,
259                    body: None,
260                    user_agent: self.pick_user_agent(),
261                    headers,
262                    want_body: true,
263                };
264                let mut primary = match fetcher.fetch(&req).await {
265                    Ok(resp) => self.finish(site, url.clone(), started, &resp),
266                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
267                };
268                primary.transport = Some(crate::escalation::TransportTier::Impersonate);
269                return self.maybe_escalate(site, &url, headers, primary).await;
270            }
271        }
272
273        // Egress selection: route the HTTP path through a geo / IP-type
274        // matching proxy when the site's access policy demands one. An
275        // unconstrained policy uses the default egress; a constrained
276        // policy with no matching egress is reported `GeoUnavailable`
277        // rather than fetched from the wrong location (a false
278        // `NotFound` would be worse than an honest `Uncertain`).
279        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
280            EgressChoice::Default => Arc::clone(&self.http),
281            EgressChoice::Use(fetcher) => fetcher,
282            EgressChoice::Unavailable => {
283                return uncertain(
284                    &site.name,
285                    url,
286                    Instant::now(),
287                    UncertainReason::GeoUnavailable,
288                );
289            }
290        };
291
292        let host = host_of(&url);
293
294        // robots.txt gate, before consuming a throttle slot or probing.
295        if let Some(robots) = &self.robots {
296            if let Some((origin, path)) = origin_and_path(&url) {
297                if !robots.allowed(&origin, &path).await {
298                    tracing::debug!(%url, "skipped by robots.txt");
299                    return uncertain(
300                        &site.name,
301                        url,
302                        Instant::now(),
303                        UncertainReason::RobotsDisallowed,
304                    );
305                }
306            }
307        }
308
309        // Global cap first (gates every request), then per-host spacing.
310        if let Some(global) = &self.global_throttle {
311            global.wait(GLOBAL_THROTTLE_KEY).await;
312        }
313        self.throttle.wait(&host).await;
314        let started = Instant::now();
315        tracing::debug!(%url, %host, "probing");
316
317        // Read the body only if a signal needs it, or enrichment is on
318        // and the site declares extractor rules (extraction needs it).
319        let want_enrich = self.enrich && !site.extract.is_empty();
320        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
321
322        // POST sites carry their own body payload (the username goes in
323        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
324        // `{username}` in `Site::request_body` is substituted here,
325        // mirroring URL substitution.
326        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
327            const USERNAME_PH: &str = "{username}";
328            site.request_body
329                .as_deref()
330                .map(|t| t.replace(USERNAME_PH, username.as_str()))
331        } else {
332            None
333        };
334
335        let req = FetchRequest {
336            method: site.request_method,
337            url: &url,
338            body: body_for_post.as_deref(),
339            user_agent: self.pick_user_agent(),
340            headers,
341            want_body: needs_body,
342        };
343        let mut primary = match egress.fetch(&req).await {
344            Ok(resp) => self.finish(site, url.clone(), started, &resp),
345            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
346        };
347        primary.transport = Some(crate::escalation::TransportTier::Http);
348        self.maybe_escalate(site, &url, headers, primary).await
349    }
350
351    /// If the cheap transport returned an `Uncertain` reason a browser
352    /// fetch could plausibly resolve, retry through the browser backend
353    /// and stamp the new outcome as escalated. Bounded by
354    /// [`escalation_budget`](ClientBuilder::escalation_budget).
355    async fn maybe_escalate(
356        &self,
357        site: &Site,
358        url: &str,
359        headers: &BTreeMap<String, String>,
360        primary: CheckOutcome,
361    ) -> CheckOutcome {
362        if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
363            return primary;
364        }
365        let Some(reason) = &primary.reason else {
366            return primary;
367        };
368        if !crate::escalation::should_escalate(reason) {
369            return primary;
370        }
371        let Some(backend) = &self.browser else {
372            return primary;
373        };
374        if !self.escalation_budget.try_consume() {
375            tracing::debug!(site = %site.name, "escalation budget exhausted");
376            return primary;
377        }
378
379        tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
380        let started = Instant::now();
381        let req = FetchRequest {
382            method: site.request_method,
383            url,
384            body: None,
385            user_agent: None,
386            headers,
387            want_body: true,
388        };
389        let fetcher = BrowserFetcher::new(Arc::clone(backend));
390        let mut escalated = match fetcher.fetch(&req).await {
391            Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
392            Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
393        };
394        escalated.transport = Some(crate::escalation::TransportTier::Browser);
395        escalated.escalations = 1;
396        escalated
397    }
398
399    /// Evaluate a fetched response against the site's signals and build
400    /// the outcome. Shared by the HTTP and browser transports so the
401    /// verdict / evidence / enrichment logic lives in exactly one place.
402    fn finish(
403        &self,
404        site: &Site,
405        url: String,
406        started: Instant,
407        resp: &crate::transport::FetchResponse,
408    ) -> CheckOutcome {
409        let probe = Probe {
410            status: resp.status,
411            final_url: &resp.final_url,
412            body: &resp.body,
413        };
414        let votes: Vec<(&Signal, SignalVerdict)> = site
415            .signals
416            .iter()
417            .map(|s| (s, s.evaluate(&probe)))
418            .collect();
419        let kind = aggregate(votes.iter().map(|(_, v)| *v));
420        let mut result = outcome(&site.name, url, started, kind);
421        // Record which signals produced the verdict (the winning polarity).
422        let winning = match kind {
423            MatchKind::Found => Some(SignalVerdict::Found),
424            MatchKind::NotFound => Some(SignalVerdict::NotFound),
425            MatchKind::Uncertain => None,
426        };
427        if let Some(want) = winning {
428            result.evidence = votes
429                .iter()
430                .filter(|(_, v)| *v == want)
431                .map(|(s, _)| s.describe_match(&probe))
432                .collect();
433        }
434        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
435            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
436        }
437        result
438    }
439}