Skip to main content

adler_core/client/
probe.rs

1//! Per-site probe path: routing, ban-retry, escalation, finish.
2//!
3//! Hosts the methods on [`Client`] that turn one `(site, username)`
4//! pair into a [`CheckOutcome`]: the public entry point
5//! [`Client::check`], the request-issuing path [`Client::probe_once`]
6//! (browser routing → impersonate-fingerprint → egress selection →
7//! HTTP fetch), the HTTP→browser escalation in [`Client::maybe_escalate`]
8//! when a cheap-path response merits a second look, and the
9//! signal-evaluation [`Client::finish`] that turns a raw response
10//! into a final outcome. Also hosts the two diagnostic fetch helpers
11//! (`fetch`, `fetch_for_doctor`) used by `adler --doctor --fix`.
12//!
13//! Construction lives in `client::builder`; accessors and
14//! [`Client::with_egress_subset`] stay in `client::mod`.
15
16use std::borrow::Cow;
17use std::collections::BTreeMap;
18use std::sync::Arc;
19use std::time::{Instant, SystemTime, UNIX_EPOCH};
20
21use crate::access::EgressChoice;
22use crate::check::{CheckOutcome, MatchKind, UncertainReason};
23use crate::escalation::TransportTier;
24use crate::retry;
25use crate::site::{HttpMethod, Probe, ProtectionKind, Signal, SignalVerdict, Site, aggregate};
26use crate::transport::{
27    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
28};
29use crate::username::Username;
30
31use super::util::{host_of, origin_and_path, outcome, uncertain};
32use super::{BOT_PROTECTED_TAG, Client, GLOBAL_THROTTLE_KEY, RawResponse};
33
34fn routes_through_browser(site: &Site) -> bool {
35    site.tags
36        .iter()
37        .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
38        || site
39            .protection
40            .iter()
41            .any(|p| !matches!(p, ProtectionKind::UserAuth))
42}
43
/// How a response was obtained, handed to [`Client::finish`] so the
/// same facts can be stamped onto the outcome and onto any profile
/// evidence recorded for it.
#[derive(Debug, Clone, Copy)]
struct ProbeEvidenceContext {
    /// Transport tier that produced the response being evaluated.
    transport: TransportTier,
    /// Escalation count to record: `0` for a primary fetch, `1` for the
    /// browser retry issued by `maybe_escalate`.
    escalations: u8,
    /// `true` when an operator session's headers were folded into the
    /// request headers for this probe.
    authenticated: bool,
}
50
51impl Client {
52    /// Probe a single site for `username`, retrying on transient bans.
53    ///
54    /// Network failures, timeouts, and unexpected response shapes all yield
55    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
56    /// returns an error: at the executor level we want a partial result for
57    /// every site, not abort-on-first-failure semantics.
58    ///
59    /// When ban detection classifies a response as `rate_limited` /
60    /// `cloudflare_challenge`, the call is retried with jittered exponential
61    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
62    /// Uncertain (network errors, body read failures) is **not** retried —
63    /// those failures rarely fix themselves in the seconds-to-minutes window
64    /// we'd block for.
65    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
66    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
67        let mut attempt: u32 = 0;
68        loop {
69            let outcome = self.probe_once(site, username).await;
70            if !retry::should_retry(&outcome, attempt, &self.retry) {
71                return outcome;
72            }
73            let delay = retry::backoff_delay(attempt, &self.retry);
74            tracing::info!(
75                site = %site.name,
76                attempt = attempt + 1,
77                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
78                ?delay,
79                "transient ban, retrying",
80            );
81            tokio::time::sleep(delay).await;
82            attempt += 1;
83        }
84    }
85
86    /// Fetch a URL and return raw response data (status, final URL, body)
87    /// with the same throttle / User-Agent / proxy machinery as `check`,
88    /// but without signal evaluation or retry.
89    ///
90    /// Returns `None` on any network/transport error. Intended for
91    /// diagnostics such as `adler --doctor --fix`, which diffs the
92    /// responses for a known-present and a nonsense user to derive a
93    /// signature.
94    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
95        let host = host_of(url);
96        if let Some(global) = &self.global_throttle {
97            global.wait(GLOBAL_THROTTLE_KEY).await;
98        }
99        self.throttle.wait(&host).await;
100        let mut request = self.http.client().get(url);
101        if let Some(ua) = self.pick_user_agent() {
102            request = request.header(reqwest::header::USER_AGENT, ua);
103        }
104        let response = request.send().await.ok()?;
105        let status = response.status().as_u16();
106        let final_url = response.url().to_string();
107        let body = response.text().await.unwrap_or_default();
108        Some(RawResponse {
109            status,
110            final_url,
111            body,
112        })
113    }
114
115    /// Same as [`Self::fetch`] but routes through the configured browser
116    /// backend when the site is tagged `bot-protected` and a backend is
117    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
118    /// so that the diff-derivation works against the JS-rendered page
119    /// (login wall vs. real profile) rather than two identical raw-HTTP
120    /// shells.
121    ///
122    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
123    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
124    /// callers get the same `Option<RawResponse>` shape either way.
125    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
126        if let Some(backend) = self.browser.as_deref() {
127            if routes_through_browser(site) {
128                let parsed = url::Url::parse(url).ok()?;
129                match backend
130                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
131                    .await
132                {
133                    Ok(page) => {
134                        return Some(RawResponse {
135                            status: page.status,
136                            final_url: page.final_url.to_string(),
137                            body: page.body,
138                        });
139                    }
140                    Err(err) => {
141                        tracing::warn!(
142                            site = %site.name, %url, error = %err,
143                            "browser fetch failed in doctor; falling back to raw HTTP",
144                        );
145                    }
146                }
147            }
148        }
149        self.fetch(url).await
150    }
151
152    /// Pick a User-Agent for the next request from the rotation pool, or
153    /// `None` to fall back on the client's fixed header.
154    fn pick_user_agent(&self) -> Option<&str> {
155        match self.user_agents.len() {
156            0 => None,
157            1 => Some(&self.user_agents[0]),
158            n => Some(&self.user_agents[fastrand::usize(0..n)]),
159        }
160    }
161
    /// Issue exactly one probe for `(site, username)` and return its
    /// outcome. The ban-retry loop lives in [`Client::check`]; this
    /// method never retries by itself.
    ///
    /// Stages, in order (each early exit yields an `Uncertain` outcome
    /// with the named reason):
    ///
    /// 1. Username constraint: if `site.regex_check` compiles and does
    ///    not match, return `UsernameNotAllowed` without any request.
    ///    The pattern is compiled on every call.
    /// 2. Session: if the access policy names a session that is not
    ///    registered, return `SessionRequired`; otherwise fold the
    ///    session's headers over the site's own.
    /// 3. Browser routing: when a browser backend is configured and
    ///    `routes_through_browser(site)` holds, fetch through the browser
    ///    (or return `BrowserBudget` if the budget is spent). This path
    ///    returns directly and is never escalated.
    /// 4. Impersonate (feature `impersonate`): pure `TlsFingerprint`
    ///    sites without the bot-protected tag go through the
    ///    impersonating fetcher, then `maybe_escalate`.
    /// 5. Egress selection: `GeoUnavailable` when the policy is
    ///    constrained and nothing matches.
    /// 6. robots.txt gate: `RobotsDisallowed` when disallowed.
    /// 7. Global then per-host throttle, HTTP fetch, `finish`, and
    ///    `maybe_escalate`.
    ///
    /// Stages 3 and 4 return before stages 5-7 run, so the browser and
    /// impersonate paths do not pass through egress selection, the
    /// robots.txt gate, or the throttles in this function.
    // Splitting probe_once into helpers would scatter the request/response
    // flow that has to read top-to-bottom; one long function reads better.
    #[allow(clippy::too_many_lines)]
    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
        let url = site.url_for(username);

        // Site-level username constraint (Sherlock's `regexCheck`).
        // Mismatch → skip the probe entirely. Saves a request and
        // sidesteps the false-positive class where a site 404s on
        // illegal usernames in a way our signal can't distinguish
        // from a missing account. If the pattern fails to compile
        // (Sherlock occasionally uses lookarounds, which our `regex`
        // crate can't express), we let validate's warn-log stand
        // and silently fall through — the rest of the probe still
        // works.
        if let Some(pat) = &site.regex_check {
            if let Ok(re) = regex::Regex::new(pat) {
                if !re.is_match(username.as_str()) {
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::UsernameNotAllowed,
                    );
                }
            }
        }

        // Resolve an operator session if the site's access policy names
        // one, and fold its headers (cookies / tokens) over the site's
        // own. A named-but-missing session is reported rather than sent
        // unauthenticated into a login wall — which reads identically
        // for an existing and a missing account. Applies to both the
        // HTTP and browser transports.
        let (session_headers, authenticated): (Cow<'_, BTreeMap<String, String>>, bool) =
            match &site.access.session {
                None => (Cow::Borrowed(&site.request_headers), false),
                Some(name) => match self.sessions.get(name) {
                    Some(session) => (Cow::Owned(session.apply(&site.request_headers)), true),
                    None => {
                        return uncertain(
                            &site.name,
                            url,
                            Instant::now(),
                            UncertainReason::SessionRequired,
                        );
                    }
                },
            };
        // Deref the `Cow` once so every request below borrows the same map.
        let headers: &BTreeMap<String, String> = &session_headers;

        // Auto-route bot-protected sites through the browser backend when
        // one is configured. Raw HTTP can't see past their JS/login wall,
        // so this is the only way they ever produce a Found verdict.
        // A site is "bot-protected" in the routing sense if it carries
        // the legacy tag OR declares any specific protection mechanism
        // via the new `protection` field — either signal is enough.
        if let Some(backend) = &self.browser {
            if routes_through_browser(site) {
                if self.browser_budget.try_consume() {
                    let started = Instant::now();
                    let req = FetchRequest {
                        method: site.request_method,
                        url: &url,
                        body: None,
                        user_agent: None,
                        headers,
                        want_body: true,
                    };
                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
                    let mut outcome = match fetcher.fetch(&req).await {
                        Ok(resp) => self.finish(
                            site,
                            username,
                            url,
                            started,
                            &resp,
                            ProbeEvidenceContext {
                                transport: TransportTier::Browser,
                                escalations: 0,
                                authenticated,
                            },
                        ),
                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
                    };
                    outcome.transport = Some(TransportTier::Browser);
                    return outcome;
                }
                // Budget spent: report it instead of silently downgrading
                // to a transport that cannot see past the protection.
                tracing::warn!(site = %site.name, "browser budget exhausted");
                let mut outcome = uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::BrowserBudget,
                );
                outcome.transport = Some(TransportTier::Browser);
                return outcome;
            }
        }

        // Phase 2: route pure-`TlsFingerprint` sites through the
        // impersonating transport — a real BoringSSL TLS handshake from
        // `wreq` matches Chrome's JA3/JA4 fingerprint that triggered the
        // protection tag, at a fraction of the cost of a real browser.
        // Mixed-protection sites (TLS-fingerprint + Cloudflare, etc.)
        // keep going through the browser path above, where they were.
        //
        // NOTE(review): `routes_through_browser` is true for any
        // protection other than `UserAuth`, which includes a lone
        // `TlsFingerprint`. With a browser backend configured, such sites
        // return from the block above and never reach this path — confirm
        // that is the intended precedence.
        #[cfg(feature = "impersonate")]
        if let Some(fetcher) = &self.impersonate {
            let pure_tls = site.protection.len() == 1
                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
                && !site
                    .tags
                    .iter()
                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if pure_tls {
                let started = Instant::now();
                let req = FetchRequest {
                    method: site.request_method,
                    url: &url,
                    body: None,
                    user_agent: self.pick_user_agent(),
                    headers,
                    want_body: true,
                };
                let mut primary = match fetcher.fetch(&req).await {
                    Ok(resp) => self.finish(
                        site,
                        username,
                        url.clone(),
                        started,
                        &resp,
                        ProbeEvidenceContext {
                            transport: TransportTier::Impersonate,
                            escalations: 0,
                            authenticated,
                        },
                    ),
                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
                };
                primary.transport = Some(TransportTier::Impersonate);
                return self
                    .maybe_escalate(site, username, &url, headers, authenticated, primary)
                    .await;
            }
        }

        // Egress selection: route the HTTP path through a geo / IP-type
        // matching proxy when the site's access policy demands one. An
        // unconstrained policy uses the default egress; a constrained
        // policy with no matching egress is reported `GeoUnavailable`
        // rather than fetched from the wrong location (a false
        // `NotFound` would be worse than an honest `Uncertain`).
        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
            EgressChoice::Default => Arc::clone(&self.http),
            EgressChoice::Use(fetcher) => fetcher,
            EgressChoice::Unavailable => {
                return uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::GeoUnavailable,
                );
            }
        };

        let host = host_of(&url);

        // robots.txt gate, before consuming a throttle slot or probing.
        if let Some(robots) = &self.robots {
            if let Some((origin, path)) = origin_and_path(&url) {
                if !robots.allowed(&origin, &path).await {
                    tracing::debug!(%url, "skipped by robots.txt");
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::RobotsDisallowed,
                    );
                }
            }
        }

        // Global cap first (gates every request), then per-host spacing.
        if let Some(global) = &self.global_throttle {
            global.wait(GLOBAL_THROTTLE_KEY).await;
        }
        self.throttle.wait(&host).await;
        // Start the clock only after throttling so the wait is not
        // counted as part of this probe's timing.
        let started = Instant::now();
        tracing::debug!(%url, %host, "probing");

        // Read the body only if a signal needs it, or enrichment is on
        // and the site declares extractor rules (extraction needs it).
        let want_enrich = self.enrich && !site.extract.is_empty();
        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);

        // POST sites carry their own body payload (the username goes in
        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
        // `{username}` in `Site::request_body` is substituted here,
        // mirroring URL substitution.
        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
            const USERNAME_PH: &str = "{username}";
            site.request_body
                .as_deref()
                .map(|t| t.replace(USERNAME_PH, username.as_str()))
        } else {
            None
        };

        let req = FetchRequest {
            method: site.request_method,
            url: &url,
            body: body_for_post.as_deref(),
            user_agent: self.pick_user_agent(),
            headers,
            want_body: needs_body,
        };
        let mut primary = match egress.fetch(&req).await {
            Ok(resp) => self.finish(
                site,
                username,
                url.clone(),
                started,
                &resp,
                ProbeEvidenceContext {
                    transport: TransportTier::Http,
                    escalations: 0,
                    authenticated,
                },
            ),
            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
        };
        primary.transport = Some(TransportTier::Http);
        // Give the browser a chance to resolve an escalatable `Uncertain`.
        self.maybe_escalate(site, username, &url, headers, authenticated, primary)
            .await
    }
397
398    /// If the cheap transport returned an `Uncertain` reason a browser
399    /// fetch could plausibly resolve, retry through the browser backend
400    /// and stamp the new outcome as escalated. Bounded by
401    /// [`escalation_budget`](ClientBuilder::escalation_budget).
402    async fn maybe_escalate(
403        &self,
404        site: &Site,
405        username: &Username,
406        url: &str,
407        headers: &BTreeMap<String, String>,
408        authenticated: bool,
409        primary: CheckOutcome,
410    ) -> CheckOutcome {
411        if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
412            return primary;
413        }
414        let Some(reason) = &primary.reason else {
415            return primary;
416        };
417        if !crate::escalation::should_escalate(reason) {
418            return primary;
419        }
420        let Some(backend) = &self.browser else {
421            return primary;
422        };
423        if !self.escalation_budget.try_consume() {
424            tracing::debug!(site = %site.name, "escalation budget exhausted");
425            return primary;
426        }
427
428        tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
429        let started = Instant::now();
430        let req = FetchRequest {
431            method: site.request_method,
432            url,
433            body: None,
434            user_agent: None,
435            headers,
436            want_body: true,
437        };
438        let fetcher = BrowserFetcher::new(Arc::clone(backend));
439        let mut escalated = match fetcher.fetch(&req).await {
440            Ok(resp) => self.finish(
441                site,
442                username,
443                url.to_owned(),
444                started,
445                &resp,
446                ProbeEvidenceContext {
447                    transport: TransportTier::Browser,
448                    escalations: 1,
449                    authenticated,
450                },
451            ),
452            Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
453        };
454        escalated.transport = Some(TransportTier::Browser);
455        escalated.escalations = 1;
456        escalated
457    }
458
459    /// Evaluate a fetched response against the site's signals and build
460    /// the outcome. Shared by the HTTP and browser transports so the
461    /// verdict / evidence / enrichment logic lives in exactly one place.
462    fn finish(
463        &self,
464        site: &Site,
465        username: &Username,
466        url: String,
467        started: Instant,
468        resp: &crate::transport::FetchResponse,
469        context: ProbeEvidenceContext,
470    ) -> CheckOutcome {
471        let canonical_username = site.canonical_username(username);
472        let probe = Probe {
473            status: resp.status,
474            final_url: &resp.final_url,
475            body: &resp.body,
476            username: &canonical_username,
477        };
478        let votes: Vec<(&Signal, SignalVerdict)> = site
479            .signals
480            .iter()
481            .map(|s| (s, s.evaluate(&probe)))
482            .collect();
483        let kind = aggregate(votes.iter().map(|(_, v)| *v));
484        let mut result = outcome(&site.name, url, started, kind);
485        result.transport = Some(context.transport);
486        result.escalations = context.escalations;
487        // Record which signals produced the verdict (the winning polarity).
488        let winning = match kind {
489            MatchKind::Found => Some(SignalVerdict::Found),
490            MatchKind::NotFound => Some(SignalVerdict::NotFound),
491            MatchKind::Uncertain => None,
492        };
493        if let Some(want) = winning {
494            result.evidence = votes
495                .iter()
496                .filter(|(_, v)| *v == want)
497                .map(|(s, _)| s.describe_match(&probe))
498                .collect();
499        }
500        let username_confirmed = kind == MatchKind::Found
501            && votes
502                .iter()
503                .any(|(s, v)| *v == SignalVerdict::Found && s.confirms_username());
504        if username_confirmed {
505            let observed_at_ms = unix_epoch_ms();
506            let access_path = crate::EvidenceAccessPath::new(
507                context.transport,
508                context.escalations,
509                context.authenticated,
510            );
511            result
512                .profile_evidence
513                .push(crate::ProfileEvidence::from_signal_username(
514                    &result.site,
515                    &result.url,
516                    &canonical_username,
517                    Some(observed_at_ms),
518                    Some(access_path),
519                ));
520        }
521        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
522            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
523            let observed_at_ms = unix_epoch_ms();
524            let access_path = crate::EvidenceAccessPath::new(
525                context.transport,
526                context.escalations,
527                context.authenticated,
528            );
529            result.profile_evidence = result
530                .enrichment
531                .iter()
532                .map(|(field, value)| {
533                    crate::ProfileEvidence::from_enrichment_with_source(
534                        &result.site,
535                        &result.url,
536                        field,
537                        value,
538                        Some(observed_at_ms),
539                        Some(access_path.clone()),
540                    )
541                })
542                .collect();
543        }
544        result.refresh_confidence();
545        result
546    }
547}
548
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Returns `u64::MAX` when the system clock reads earlier than the
/// epoch, or when the millisecond count does not fit in a `u64`.
fn unix_epoch_ms() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
        Err(_) => u64::MAX,
    }
}