Skip to main content

adler_core/client/
probe.rs

1//! Per-site probe path: routing, ban-retry, escalation, finish.
2//!
3//! Hosts the methods on [`Client`] that turn one `(site, username)`
4//! pair into a [`CheckOutcome`]: the public entry point
5//! [`Client::check`], the request-issuing path [`Client::probe_once`]
6//! (browser routing → impersonate-fingerprint → egress selection →
7//! HTTP fetch), the HTTP→browser escalation in [`Client::maybe_escalate`]
8//! when a cheap-path response merits a second look, and the
9//! signal-evaluation [`Client::finish`] that turns a raw response
10//! into a final outcome. Also hosts the two diagnostic fetch helpers
11//! (`fetch`, `fetch_for_doctor`) used by `adler --doctor --fix`.
12//!
13//! Construction lives in `client::builder`; accessors and
14//! [`Client::with_egress_subset`] stay in `client::mod`.
15
16use std::borrow::Cow;
17use std::collections::BTreeMap;
18use std::sync::Arc;
19use std::time::Instant;
20
21use crate::access::EgressChoice;
22use crate::check::{CheckOutcome, MatchKind, UncertainReason};
23use crate::retry;
24use crate::site::{HttpMethod, Probe, ProtectionKind, Signal, SignalVerdict, Site, aggregate};
25use crate::transport::{
26    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
27};
28use crate::username::Username;
29
30use super::util::{host_of, origin_and_path, outcome, uncertain};
31use super::{BOT_PROTECTED_TAG, Client, GLOBAL_THROTTLE_KEY, RawResponse};
32
33fn routes_through_browser(site: &Site) -> bool {
34    site.tags
35        .iter()
36        .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
37        || site
38            .protection
39            .iter()
40            .any(|p| !matches!(p, ProtectionKind::UserAuth))
41}
42
43impl Client {
44    /// Probe a single site for `username`, retrying on transient bans.
45    ///
46    /// Network failures, timeouts, and unexpected response shapes all yield
47    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
48    /// returns an error: at the executor level we want a partial result for
49    /// every site, not abort-on-first-failure semantics.
50    ///
51    /// When ban detection classifies a response as `rate_limited` /
52    /// `cloudflare_challenge`, the call is retried with jittered exponential
53    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
54    /// Uncertain (network errors, body read failures) is **not** retried —
55    /// those failures rarely fix themselves in the seconds-to-minutes window
56    /// we'd block for.
57    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
58    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
59        let mut attempt: u32 = 0;
60        loop {
61            let outcome = self.probe_once(site, username).await;
62            if !retry::should_retry(&outcome, attempt, &self.retry) {
63                return outcome;
64            }
65            let delay = retry::backoff_delay(attempt, &self.retry);
66            tracing::info!(
67                site = %site.name,
68                attempt = attempt + 1,
69                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
70                ?delay,
71                "transient ban, retrying",
72            );
73            tokio::time::sleep(delay).await;
74            attempt += 1;
75        }
76    }
77
78    /// Fetch a URL and return raw response data (status, final URL, body)
79    /// with the same throttle / User-Agent / proxy machinery as `check`,
80    /// but without signal evaluation or retry.
81    ///
82    /// Returns `None` on any network/transport error. Intended for
83    /// diagnostics such as `adler --doctor --fix`, which diffs the
84    /// responses for a known-present and a nonsense user to derive a
85    /// signature.
86    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
87        let host = host_of(url);
88        if let Some(global) = &self.global_throttle {
89            global.wait(GLOBAL_THROTTLE_KEY).await;
90        }
91        self.throttle.wait(&host).await;
92        let mut request = self.http.client().get(url);
93        if let Some(ua) = self.pick_user_agent() {
94            request = request.header(reqwest::header::USER_AGENT, ua);
95        }
96        let response = request.send().await.ok()?;
97        let status = response.status().as_u16();
98        let final_url = response.url().to_string();
99        let body = response.text().await.unwrap_or_default();
100        Some(RawResponse {
101            status,
102            final_url,
103            body,
104        })
105    }
106
107    /// Same as [`Self::fetch`] but routes through the configured browser
108    /// backend when the site is tagged `bot-protected` and a backend is
109    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
110    /// so that the diff-derivation works against the JS-rendered page
111    /// (login wall vs. real profile) rather than two identical raw-HTTP
112    /// shells.
113    ///
114    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
115    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
116    /// callers get the same `Option<RawResponse>` shape either way.
117    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
118        if let Some(backend) = self.browser.as_deref() {
119            if routes_through_browser(site) {
120                let parsed = url::Url::parse(url).ok()?;
121                match backend
122                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
123                    .await
124                {
125                    Ok(page) => {
126                        return Some(RawResponse {
127                            status: page.status,
128                            final_url: page.final_url.to_string(),
129                            body: page.body,
130                        });
131                    }
132                    Err(err) => {
133                        tracing::warn!(
134                            site = %site.name, %url, error = %err,
135                            "browser fetch failed in doctor; falling back to raw HTTP",
136                        );
137                    }
138                }
139            }
140        }
141        self.fetch(url).await
142    }
143
144    /// Pick a User-Agent for the next request from the rotation pool, or
145    /// `None` to fall back on the client's fixed header.
146    fn pick_user_agent(&self) -> Option<&str> {
147        match self.user_agents.len() {
148            0 => None,
149            1 => Some(&self.user_agents[0]),
150            n => Some(&self.user_agents[fastrand::usize(0..n)]),
151        }
152    }
153
154    // Splitting probe_once into helpers would scatter the request/response
155    // flow that has to read top-to-bottom; one long function reads better.
156    #[allow(clippy::too_many_lines)]
157    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
158        let url = site.url_for(username);
159
160        // Site-level username constraint (Sherlock's `regexCheck`).
161        // Mismatch → skip the probe entirely. Saves a request and
162        // sidesteps the false-positive class where a site 404s on
163        // illegal usernames in a way our signal can't distinguish
164        // from a missing account. If the pattern fails to compile
165        // (Sherlock occasionally uses lookarounds, which our `regex`
166        // crate can't express), we let validate's warn-log stand
167        // and silently fall through — the rest of the probe still
168        // works.
169        if let Some(pat) = &site.regex_check {
170            if let Ok(re) = regex::Regex::new(pat) {
171                if !re.is_match(username.as_str()) {
172                    return uncertain(
173                        &site.name,
174                        url,
175                        Instant::now(),
176                        UncertainReason::UsernameNotAllowed,
177                    );
178                }
179            }
180        }
181
182        // Resolve an operator session if the site's access policy names
183        // one, and fold its headers (cookies / tokens) over the site's
184        // own. A named-but-missing session is reported rather than sent
185        // unauthenticated into a login wall — which reads identically
186        // for an existing and a missing account. Applies to both the
187        // HTTP and browser transports.
188        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
189            None => Cow::Borrowed(&site.request_headers),
190            Some(name) => match self.sessions.get(name) {
191                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
192                None => {
193                    return uncertain(
194                        &site.name,
195                        url,
196                        Instant::now(),
197                        UncertainReason::SessionRequired,
198                    );
199                }
200            },
201        };
202        let headers: &BTreeMap<String, String> = &session_headers;
203
204        // Auto-route bot-protected sites through the browser backend when
205        // one is configured. Raw HTTP can't see past their JS/login wall,
206        // so this is the only way they ever produce a Found verdict.
207        // A site is "bot-protected" in the routing sense if it carries
208        // the legacy tag OR declares any specific protection mechanism
209        // via the new `protection` field — either signal is enough.
210        if let Some(backend) = &self.browser {
211            if routes_through_browser(site) {
212                if self.browser_budget.try_consume() {
213                    let started = Instant::now();
214                    let req = FetchRequest {
215                        method: site.request_method,
216                        url: &url,
217                        body: None,
218                        user_agent: None,
219                        headers,
220                        want_body: true,
221                    };
222                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
223                    let mut outcome = match fetcher.fetch(&req).await {
224                        Ok(resp) => self.finish(site, url, started, &resp),
225                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
226                    };
227                    outcome.transport = Some(crate::escalation::TransportTier::Browser);
228                    return outcome;
229                }
230                tracing::warn!(site = %site.name, "browser budget exhausted");
231                let mut outcome = uncertain(
232                    &site.name,
233                    url,
234                    Instant::now(),
235                    UncertainReason::BrowserBudget,
236                );
237                outcome.transport = Some(crate::escalation::TransportTier::Browser);
238                return outcome;
239            }
240        }
241
242        // Phase 2: route pure-`TlsFingerprint` sites through the
243        // impersonating transport — a real BoringSSL TLS handshake from
244        // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
245        // protection tag, at a fraction of the cost of a real browser.
246        // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
247        // keep going through the browser path above, where they were.
248        #[cfg(feature = "impersonate")]
249        if let Some(fetcher) = &self.impersonate {
250            let pure_tls = site.protection.len() == 1
251                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
252                && !site
253                    .tags
254                    .iter()
255                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
256            if pure_tls {
257                let started = Instant::now();
258                let req = FetchRequest {
259                    method: site.request_method,
260                    url: &url,
261                    body: None,
262                    user_agent: self.pick_user_agent(),
263                    headers,
264                    want_body: true,
265                };
266                let mut primary = match fetcher.fetch(&req).await {
267                    Ok(resp) => self.finish(site, url.clone(), started, &resp),
268                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
269                };
270                primary.transport = Some(crate::escalation::TransportTier::Impersonate);
271                return self.maybe_escalate(site, &url, headers, primary).await;
272            }
273        }
274
275        // Egress selection: route the HTTP path through a geo / IP-type
276        // matching proxy when the site's access policy demands one. An
277        // unconstrained policy uses the default egress; a constrained
278        // policy with no matching egress is reported `GeoUnavailable`
279        // rather than fetched from the wrong location (a false
280        // `NotFound` would be worse than an honest `Uncertain`).
281        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
282            EgressChoice::Default => Arc::clone(&self.http),
283            EgressChoice::Use(fetcher) => fetcher,
284            EgressChoice::Unavailable => {
285                return uncertain(
286                    &site.name,
287                    url,
288                    Instant::now(),
289                    UncertainReason::GeoUnavailable,
290                );
291            }
292        };
293
294        let host = host_of(&url);
295
296        // robots.txt gate, before consuming a throttle slot or probing.
297        if let Some(robots) = &self.robots {
298            if let Some((origin, path)) = origin_and_path(&url) {
299                if !robots.allowed(&origin, &path).await {
300                    tracing::debug!(%url, "skipped by robots.txt");
301                    return uncertain(
302                        &site.name,
303                        url,
304                        Instant::now(),
305                        UncertainReason::RobotsDisallowed,
306                    );
307                }
308            }
309        }
310
311        // Global cap first (gates every request), then per-host spacing.
312        if let Some(global) = &self.global_throttle {
313            global.wait(GLOBAL_THROTTLE_KEY).await;
314        }
315        self.throttle.wait(&host).await;
316        let started = Instant::now();
317        tracing::debug!(%url, %host, "probing");
318
319        // Read the body only if a signal needs it, or enrichment is on
320        // and the site declares extractor rules (extraction needs it).
321        let want_enrich = self.enrich && !site.extract.is_empty();
322        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
323
324        // POST sites carry their own body payload (the username goes in
325        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
326        // `{username}` in `Site::request_body` is substituted here,
327        // mirroring URL substitution.
328        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
329            const USERNAME_PH: &str = "{username}";
330            site.request_body
331                .as_deref()
332                .map(|t| t.replace(USERNAME_PH, username.as_str()))
333        } else {
334            None
335        };
336
337        let req = FetchRequest {
338            method: site.request_method,
339            url: &url,
340            body: body_for_post.as_deref(),
341            user_agent: self.pick_user_agent(),
342            headers,
343            want_body: needs_body,
344        };
345        let mut primary = match egress.fetch(&req).await {
346            Ok(resp) => self.finish(site, url.clone(), started, &resp),
347            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
348        };
349        primary.transport = Some(crate::escalation::TransportTier::Http);
350        self.maybe_escalate(site, &url, headers, primary).await
351    }
352
353    /// If the cheap transport returned an `Uncertain` reason a browser
354    /// fetch could plausibly resolve, retry through the browser backend
355    /// and stamp the new outcome as escalated. Bounded by
356    /// [`escalation_budget`](ClientBuilder::escalation_budget).
357    async fn maybe_escalate(
358        &self,
359        site: &Site,
360        url: &str,
361        headers: &BTreeMap<String, String>,
362        primary: CheckOutcome,
363    ) -> CheckOutcome {
364        if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
365            return primary;
366        }
367        let Some(reason) = &primary.reason else {
368            return primary;
369        };
370        if !crate::escalation::should_escalate(reason) {
371            return primary;
372        }
373        let Some(backend) = &self.browser else {
374            return primary;
375        };
376        if !self.escalation_budget.try_consume() {
377            tracing::debug!(site = %site.name, "escalation budget exhausted");
378            return primary;
379        }
380
381        tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
382        let started = Instant::now();
383        let req = FetchRequest {
384            method: site.request_method,
385            url,
386            body: None,
387            user_agent: None,
388            headers,
389            want_body: true,
390        };
391        let fetcher = BrowserFetcher::new(Arc::clone(backend));
392        let mut escalated = match fetcher.fetch(&req).await {
393            Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
394            Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
395        };
396        escalated.transport = Some(crate::escalation::TransportTier::Browser);
397        escalated.escalations = 1;
398        escalated
399    }
400
401    /// Evaluate a fetched response against the site's signals and build
402    /// the outcome. Shared by the HTTP and browser transports so the
403    /// verdict / evidence / enrichment logic lives in exactly one place.
404    fn finish(
405        &self,
406        site: &Site,
407        url: String,
408        started: Instant,
409        resp: &crate::transport::FetchResponse,
410    ) -> CheckOutcome {
411        let probe = Probe {
412            status: resp.status,
413            final_url: &resp.final_url,
414            body: &resp.body,
415        };
416        let votes: Vec<(&Signal, SignalVerdict)> = site
417            .signals
418            .iter()
419            .map(|s| (s, s.evaluate(&probe)))
420            .collect();
421        let kind = aggregate(votes.iter().map(|(_, v)| *v));
422        let mut result = outcome(&site.name, url, started, kind);
423        // Record which signals produced the verdict (the winning polarity).
424        let winning = match kind {
425            MatchKind::Found => Some(SignalVerdict::Found),
426            MatchKind::NotFound => Some(SignalVerdict::NotFound),
427            MatchKind::Uncertain => None,
428        };
429        if let Some(want) = winning {
430            result.evidence = votes
431                .iter()
432                .filter(|(_, v)| *v == want)
433                .map(|(s, _)| s.describe_match(&probe))
434                .collect();
435        }
436        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
437            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
438        }
439        result
440    }
441}