Skip to main content

adler_core/
client.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use reqwest::redirect;
16
17use crate::access::{EgressChoice, EgressPool, EgressSpec};
18use crate::browser::{BrowserBackend, BrowserBudget};
19use crate::check::{CheckOutcome, MatchKind, UncertainReason};
20use crate::error::{Error, Result};
21use crate::retry::{self, RetryPolicy};
22use crate::robots::RobotsCache;
23use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
24use crate::throttle::HostThrottle;
25use crate::transport::{
26    BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
27};
28use crate::username::Username;
29
/// Default whole-request timeout used by [`ClientBuilder`].
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Default connect timeout used by [`ClientBuilder`].
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Default cap on the number of redirects followed per request.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default minimum spacing between two requests to the same host.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single fixed key for the global rate limiter (it gates all hosts).
const GLOBAL_THROTTLE_KEY: &str = "*global*";
36
/// HTTP client used to probe sites.
///
/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
/// internally, and the throttle is `Arc`-backed, so cloning is the
/// recommended way to share a client between tasks. Cloned clients share
/// throttle state, which is what you want: a fan-out scan must not
/// accidentally exceed a per-host budget by spawning more clients.
#[derive(Clone)]
pub struct Client {
    /// Default HTTP fetcher. Used by [`Client::fetch`] and by every probe
    /// whose access policy resolves to the default egress.
    http: Arc<HttpFetcher>,
    /// Geo / IP-type egress pool for sites whose `access` policy needs a
    /// specific proxy. Empty by default → every site uses `http`.
    egress: Arc<EgressPool>,
    /// Per-host request spacing, awaited before each HTTP probe and each
    /// raw fetch.
    throttle: HostThrottle,
    /// Global RPS cap applied across all hosts. `None` → uncapped.
    global_throttle: Option<HostThrottle>,
    /// Retry / backoff settings consulted by [`Client::check`].
    retry: RetryPolicy,
    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
    /// `Arc<[String]>` so cloning a client per task stays cheap.
    user_agents: Arc<[String]>,
    /// Extract profile fields from `Found` pages that declare extractors.
    enrich: bool,
    /// When set, skip probes disallowed by the host's `robots.txt`.
    robots: Option<RobotsCache>,
    /// Browser backend used for `bot-protected` sites. `None` → those sites
    /// stay on the raw HTTP path and typically end up `Uncertain`.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
    /// for a single scan, so several tasks compete for the same budget.
    browser_budget: Arc<BrowserBudget>,
}
68
69impl Client {
70    /// Start configuring a new client.
71    pub fn builder() -> ClientBuilder {
72        ClientBuilder::default()
73    }
74
75    /// Probe a single site for `username`, retrying on transient bans.
76    ///
77    /// Network failures, timeouts, and unexpected response shapes all yield
78    /// [`MatchKind::Uncertain`] with a descriptive note. The method never
79    /// returns an error: at the executor level we want a partial result for
80    /// every site, not abort-on-first-failure semantics.
81    ///
82    /// When ban detection classifies a response as `rate_limited` /
83    /// `cloudflare_challenge`, the call is retried with jittered exponential
84    /// backoff (configurable via [`ClientBuilder::max_retries`]). Non-ban
85    /// Uncertain (network errors, body read failures) is **not** retried —
86    /// those failures rarely fix themselves in the seconds-to-minutes window
87    /// we'd block for.
88    #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
89    pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
90        let mut attempt: u32 = 0;
91        loop {
92            let outcome = self.probe_once(site, username).await;
93            if !retry::should_retry(&outcome, attempt, &self.retry) {
94                return outcome;
95            }
96            let delay = retry::backoff_delay(attempt, &self.retry);
97            tracing::info!(
98                site = %site.name,
99                attempt = attempt + 1,
100                reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
101                ?delay,
102                "transient ban, retrying",
103            );
104            tokio::time::sleep(delay).await;
105            attempt += 1;
106        }
107    }
108
109    /// Fetch a URL and return raw response data (status, final URL, body)
110    /// with the same throttle / User-Agent / proxy machinery as `check`,
111    /// but without signal evaluation or retry.
112    ///
113    /// Returns `None` on any network/transport error. Intended for
114    /// diagnostics such as `adler --doctor --fix`, which diffs the
115    /// responses for a known-present and a nonsense user to derive a
116    /// signature.
117    pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
118        let host = host_of(url);
119        if let Some(global) = &self.global_throttle {
120            global.wait(GLOBAL_THROTTLE_KEY).await;
121        }
122        self.throttle.wait(&host).await;
123        let mut request = self.http.client().get(url);
124        if let Some(ua) = self.pick_user_agent() {
125            request = request.header(reqwest::header::USER_AGENT, ua);
126        }
127        let response = request.send().await.ok()?;
128        let status = response.status().as_u16();
129        let final_url = response.url().to_string();
130        let body = response.text().await.unwrap_or_default();
131        Some(RawResponse {
132            status,
133            final_url,
134            body,
135        })
136    }
137
138    /// Same as [`Self::fetch`] but routes through the configured browser
139    /// backend when the site is tagged `bot-protected` and a backend is
140    /// available. Used by [`doctor::suggest_fix`](crate::doctor::suggest_fix)
141    /// so that the diff-derivation works against the JS-rendered page
142    /// (login wall vs. real profile) rather than two identical raw-HTTP
143    /// shells.
144    ///
145    /// Falls back to raw HTTP if (a) no browser is configured, (b) the
146    /// site isn't `bot-protected`, or (c) the browser fetch fails — so
147    /// callers get the same `Option<RawResponse>` shape either way.
148    pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
149        if let Some(backend) = self.browser.as_deref() {
150            let has_tag = site
151                .tags
152                .iter()
153                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
154            if has_tag || !site.protection.is_empty() {
155                let parsed = url::Url::parse(url).ok()?;
156                match backend
157                    .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
158                    .await
159                {
160                    Ok(page) => {
161                        return Some(RawResponse {
162                            status: page.status,
163                            final_url: page.final_url.to_string(),
164                            body: page.body,
165                        });
166                    }
167                    Err(err) => {
168                        tracing::warn!(
169                            site = %site.name, %url, error = %err,
170                            "browser fetch failed in doctor; falling back to raw HTTP",
171                        );
172                    }
173                }
174            }
175        }
176        self.fetch(url).await
177    }
178
179    /// Pick a User-Agent for the next request from the rotation pool, or
180    /// `None` to fall back on the client's fixed header.
181    fn pick_user_agent(&self) -> Option<&str> {
182        match self.user_agents.len() {
183            0 => None,
184            1 => Some(&self.user_agents[0]),
185            n => Some(&self.user_agents[fastrand::usize(0..n)]),
186        }
187    }
188
189    // Splitting probe_once into helpers would scatter the request/response
190    // flow that has to read top-to-bottom; one long function reads better.
191    #[allow(clippy::too_many_lines)]
192    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
193        let url = site.url_for(username);
194
195        // Site-level username constraint (Sherlock's `regexCheck`).
196        // Mismatch → skip the probe entirely. Saves a request and
197        // sidesteps the false-positive class where a site 404s on
198        // illegal usernames in a way our signal can't distinguish
199        // from a missing account. If the pattern fails to compile
200        // (Sherlock occasionally uses lookarounds, which our `regex`
201        // crate can't express), we let validate's warn-log stand
202        // and silently fall through — the rest of the probe still
203        // works.
204        if let Some(pat) = &site.regex_check {
205            if let Ok(re) = regex::Regex::new(pat) {
206                if !re.is_match(username.as_str()) {
207                    return uncertain(
208                        &site.name,
209                        url,
210                        Instant::now(),
211                        UncertainReason::UsernameNotAllowed,
212                    );
213                }
214            }
215        }
216
217        // Auto-route bot-protected sites through the browser backend when
218        // one is configured. Raw HTTP can't see past their JS/login wall,
219        // so this is the only way they ever produce a Found verdict.
220        // A site is "bot-protected" in the routing sense if it carries
221        // the legacy tag OR declares any specific protection mechanism
222        // via the new `protection` field — either signal is enough.
223        if let Some(backend) = &self.browser {
224            let has_tag = site
225                .tags
226                .iter()
227                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
228            if has_tag || !site.protection.is_empty() {
229                if self.browser_budget.try_consume() {
230                    let started = Instant::now();
231                    let req = FetchRequest {
232                        method: site.request_method,
233                        url: &url,
234                        body: None,
235                        user_agent: None,
236                        headers: &site.request_headers,
237                        want_body: true,
238                    };
239                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
240                    return match fetcher.fetch(&req).await {
241                        Ok(resp) => self.finish(site, url, started, &resp),
242                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
243                    };
244                }
245                tracing::warn!(site = %site.name, "browser budget exhausted");
246                return uncertain(
247                    &site.name,
248                    url,
249                    Instant::now(),
250                    UncertainReason::BrowserBudget,
251                );
252            }
253        }
254
255        // Egress selection: route the HTTP path through a geo / IP-type
256        // matching proxy when the site's access policy demands one. An
257        // unconstrained policy uses the default egress; a constrained
258        // policy with no matching egress is reported `GeoUnavailable`
259        // rather than fetched from the wrong location (a false
260        // `NotFound` would be worse than an honest `Uncertain`).
261        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
262            EgressChoice::Default => Arc::clone(&self.http),
263            EgressChoice::Use(fetcher) => fetcher,
264            EgressChoice::Unavailable => {
265                return uncertain(
266                    &site.name,
267                    url,
268                    Instant::now(),
269                    UncertainReason::GeoUnavailable,
270                );
271            }
272        };
273
274        let host = host_of(&url);
275
276        // robots.txt gate, before consuming a throttle slot or probing.
277        if let Some(robots) = &self.robots {
278            if let Some((origin, path)) = origin_and_path(&url) {
279                if !robots.allowed(&origin, &path).await {
280                    tracing::debug!(%url, "skipped by robots.txt");
281                    return uncertain(
282                        &site.name,
283                        url,
284                        Instant::now(),
285                        UncertainReason::RobotsDisallowed,
286                    );
287                }
288            }
289        }
290
291        // Global cap first (gates every request), then per-host spacing.
292        if let Some(global) = &self.global_throttle {
293            global.wait(GLOBAL_THROTTLE_KEY).await;
294        }
295        self.throttle.wait(&host).await;
296        let started = Instant::now();
297        tracing::debug!(%url, %host, "probing");
298
299        // Read the body only if a signal needs it, or enrichment is on
300        // and the site declares extractor rules (extraction needs it).
301        let want_enrich = self.enrich && !site.extract.is_empty();
302        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
303
304        // POST sites carry their own body payload (the username goes in
305        // the body, not the URL — e.g. Anilist's GraphQL endpoint).
306        // `{username}` in `Site::request_body` is substituted here,
307        // mirroring URL substitution.
308        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
309            const USERNAME_PH: &str = "{username}";
310            site.request_body
311                .as_deref()
312                .map(|t| t.replace(USERNAME_PH, username.as_str()))
313        } else {
314            None
315        };
316
317        let req = FetchRequest {
318            method: site.request_method,
319            url: &url,
320            body: body_for_post.as_deref(),
321            user_agent: self.pick_user_agent(),
322            headers: &site.request_headers,
323            want_body: needs_body,
324        };
325        match egress.fetch(&req).await {
326            Ok(resp) => self.finish(site, url, started, &resp),
327            Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
328        }
329    }
330
331    /// Evaluate a fetched response against the site's signals and build
332    /// the outcome. Shared by the HTTP and browser transports so the
333    /// verdict / evidence / enrichment logic lives in exactly one place.
334    fn finish(
335        &self,
336        site: &Site,
337        url: String,
338        started: Instant,
339        resp: &crate::transport::FetchResponse,
340    ) -> CheckOutcome {
341        let probe = Probe {
342            status: resp.status,
343            final_url: &resp.final_url,
344            body: &resp.body,
345        };
346        let votes: Vec<(&Signal, SignalVerdict)> = site
347            .signals
348            .iter()
349            .map(|s| (s, s.evaluate(&probe)))
350            .collect();
351        let kind = aggregate(votes.iter().map(|(_, v)| *v));
352        let mut result = outcome(&site.name, url, started, kind);
353        // Record which signals produced the verdict (the winning polarity).
354        let winning = match kind {
355            MatchKind::Found => Some(SignalVerdict::Found),
356            MatchKind::NotFound => Some(SignalVerdict::NotFound),
357            MatchKind::Uncertain => None,
358        };
359        if let Some(want) = winning {
360            result.evidence = votes
361                .iter()
362                .filter(|(_, v)| *v == want)
363                .map(|(s, _)| s.describe_match(&probe))
364                .collect();
365        }
366        if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
367            result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
368        }
369        result
370    }
371}
372
/// Raw response data returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`] for diagnostics.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code.
    pub status: u16,
    /// Final URL after redirects.
    pub final_url: String,
    /// Decoded response body.
    pub body: String,
}
383
/// Builder for [`Client`].
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
pub struct ClientBuilder {
    /// Per-request timeout.
    timeout: Duration,
    /// Connect timeout, applied independently of `timeout`.
    connect_timeout: Duration,
    /// Fixed `User-Agent` header sent when no rotation pool is configured.
    user_agent: String,
    /// Whether redirects are followed automatically.
    follow_redirects: bool,
    /// Maximum redirects followed per request when `follow_redirects` is on.
    redirect_limit: usize,
    /// Minimum spacing between consecutive requests to the same host.
    min_request_interval: Duration,
    /// Optional cap on the total request rate across all hosts.
    max_rps: Option<NonZeroU32>,
    /// Retry / backoff settings for transient ban responses.
    retry: RetryPolicy,
    /// Proxy URL for the default HTTP client. `None` → no proxy configured.
    proxy: Option<String>,
    /// User-Agent rotation pool. Empty → always send `user_agent`.
    user_agents: Vec<String>,
    /// Whether to extract profile fields from `Found` pages.
    enrich: bool,
    /// Whether to skip probes disallowed by `robots.txt`.
    respect_robots: bool,
    /// Optional browser backend for bot-protected sites.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Per-scan cap on browser-backed probes.
    browser_budget: usize,
    /// Egress proxies that sites with an `access` policy can require.
    egress: Vec<EgressSpec>,
}
404
405impl Default for ClientBuilder {
406    fn default() -> Self {
407        Self {
408            timeout: DEFAULT_TIMEOUT,
409            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
410            user_agent: default_user_agent(),
411            follow_redirects: true,
412            redirect_limit: DEFAULT_REDIRECT_LIMIT,
413            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
414            max_rps: None,
415            retry: RetryPolicy::default(),
416            proxy: None,
417            user_agents: Vec::new(),
418            enrich: false,
419            respect_robots: false,
420            browser: None,
421            browser_budget: DEFAULT_BROWSER_BUDGET,
422            egress: Vec::new(),
423        }
424    }
425}
426
427impl ClientBuilder {
428    /// Per-request timeout (covers connect, headers, and body read).
429    pub fn timeout(mut self, timeout: Duration) -> Self {
430        self.timeout = timeout;
431        self
432    }
433
434    /// TCP-connect timeout, applied independently of the request timeout.
435    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
436        self.connect_timeout = timeout;
437        self
438    }
439
440    /// Override the `User-Agent` header sent on every request.
441    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
442        self.user_agent = user_agent.into();
443        self
444    }
445
446    /// Toggle automatic redirect following. Defaults to `true`; disable when
447    /// using [`crate::Signal::RedirectAbsent`] is undesirable for a run.
448    pub fn follow_redirects(mut self, follow: bool) -> Self {
449        self.follow_redirects = follow;
450        self
451    }
452
453    /// Minimum time between consecutive requests to the same host.
454    ///
455    /// Defaults to 100 ms (≈ 10 RPS per host) — enough headroom to avoid
456    /// rate-limit responses on common OSINT targets while keeping fan-out
457    /// across many sites fast.
458    pub fn min_request_interval(mut self, interval: Duration) -> Self {
459        self.min_request_interval = interval;
460        self
461    }
462
463    /// Cap the total request rate across *all* hosts to `rps` requests per
464    /// second. Independent of (and composed with) the per-host interval —
465    /// useful on a metered connection or behind a shared-quota proxy.
466    /// Uncapped by default.
467    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
468        self.max_rps = Some(rps);
469        self
470    }
471
472    /// Maximum retry attempts after a transient ban response. Defaults to 2
473    /// (so up to 3 total tries). Set to `0` to disable retry entirely.
474    pub fn max_retries(mut self, n: u32) -> Self {
475        self.retry.max_retries = n;
476        self
477    }
478
479    /// Base delay for the first retry. Subsequent retries double until
480    /// reaching [`Self::max_backoff_delay`]. Defaults to 500 ms.
481    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
482        self.retry.base_delay = d;
483        self
484    }
485
486    /// Cap on a single backoff delay (pre-jitter). Defaults to 30 s.
487    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
488        self.retry.max_delay = d;
489        self
490    }
491
492    /// Route all requests through a proxy. Accepts `http://`, `https://`,
493    /// and `socks5://` URLs. For Tor, pass `socks5://127.0.0.1:9050`.
494    pub fn proxy(mut self, url: impl Into<String>) -> Self {
495        self.proxy = Some(url.into());
496        self
497    }
498
499    /// Rotate the `User-Agent` header per request, picking uniformly at
500    /// random from `agents`. An empty list (the default) keeps the single
501    /// fixed User-Agent. Useful for reducing trivial fingerprinting.
502    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
503        self.user_agents = agents;
504        self
505    }
506
507    /// Extract profile fields (per [`crate::Site::extract`]) from `Found`
508    /// pages. Off by default; enables an extra body read for matching sites.
509    pub fn enrich(mut self, enrich: bool) -> Self {
510        self.enrich = enrich;
511        self
512    }
513
514    /// Honor each host's `robots.txt`: probes to disallowed paths are
515    /// skipped (reported `Uncertain`, note `robots_disallowed`). Off by
516    /// default. Adds one cached `robots.txt` fetch per origin.
517    pub fn respect_robots(mut self, respect: bool) -> Self {
518        self.respect_robots = respect;
519        self
520    }
521
522    /// Attach a browser backend. Sites tagged `bot-protected` will be
523    /// routed through it instead of the raw HTTP path, up to the
524    /// [`browser_budget`](Self::browser_budget) cap.
525    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
526        self.browser = Some(backend);
527        self
528    }
529
530    /// Per-scan cap on how many `bot-protected` sites are allowed to use
531    /// the browser backend. Once exhausted, the rest fall back to
532    /// `Uncertain(BrowserBudget)`. Defaults to
533    /// [`DEFAULT_BROWSER_BUDGET`].
534    pub const fn browser_budget(mut self, cap: usize) -> Self {
535        self.browser_budget = cap;
536        self
537    }
538
539    /// Configure the egress pool: proxies tagged by country / IP type
540    /// that sites with an `access` policy can require. Sites without a
541    /// policy are unaffected (they use the default egress / `--proxy`).
542    /// Replaces any previously set pool.
543    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
544        self.egress = egress;
545        self
546    }
547
548    /// Build a [`Client`].
549    pub fn build(self) -> Result<Client> {
550        let inner = build_reqwest(
551            &self.user_agent,
552            self.timeout,
553            self.connect_timeout,
554            self.follow_redirects,
555            self.redirect_limit,
556            self.proxy.as_deref(),
557        )?;
558
559        // One HTTP client per configured egress — `reqwest` bakes the
560        // proxy in at build time, so geo / IP-type routing means a
561        // distinct client per proxy, paired with its match metadata.
562        let mut egress_entries = Vec::with_capacity(self.egress.len());
563        for spec in &self.egress {
564            let client = build_reqwest(
565                &self.user_agent,
566                self.timeout,
567                self.connect_timeout,
568                self.follow_redirects,
569                self.redirect_limit,
570                Some(&spec.url),
571            )?;
572            egress_entries.push((
573                spec.country.clone(),
574                spec.kind,
575                Arc::new(HttpFetcher::new(client)),
576            ));
577        }
578
579        let global_throttle = self.max_rps.map(|rps| {
580            // Min spacing between any two requests = 1s / rps.
581            let interval = Duration::from_secs(1) / rps.get();
582            HostThrottle::new(interval)
583        });
584        let robots = self
585            .respect_robots
586            .then(|| RobotsCache::new(inner.clone(), "adler"));
587        Ok(Client {
588            http: Arc::new(HttpFetcher::new(inner)),
589            egress: Arc::new(EgressPool::new(egress_entries)),
590            throttle: HostThrottle::new(self.min_request_interval),
591            global_throttle,
592            retry: self.retry,
593            user_agents: Arc::from(self.user_agents),
594            enrich: self.enrich,
595            robots,
596            browser: self.browser,
597            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
598        })
599    }
600}
601
602/// Build a configured `reqwest::Client`, optionally routed through a
603/// proxy. Shared by the default client and every egress in the pool so
604/// they get identical timeout / redirect / User-Agent settings.
605fn build_reqwest(
606    user_agent: &str,
607    timeout: Duration,
608    connect_timeout: Duration,
609    follow_redirects: bool,
610    redirect_limit: usize,
611    proxy: Option<&str>,
612) -> Result<reqwest::Client> {
613    let redirect_policy = if follow_redirects {
614        redirect::Policy::limited(redirect_limit)
615    } else {
616        redirect::Policy::none()
617    };
618    let mut builder = reqwest::Client::builder()
619        .user_agent(user_agent.to_owned())
620        .timeout(timeout)
621        .connect_timeout(connect_timeout)
622        .redirect(redirect_policy);
623    if let Some(proxy_url) = proxy {
624        // reqwest treats a schemeless string (e.g. "not-a-url") as a host
625        // and silently defaults it to http://, so every probe would fail
626        // confusingly. Require an explicit, supported scheme up front.
627        const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
628        if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
629            return Err(Error::HttpSetup {
630                message: format!(
631                    "invalid proxy {proxy_url:?}: must start with one of {}",
632                    SCHEMES.join(", ")
633                ),
634            });
635        }
636        let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
637            message: format!("invalid proxy {proxy_url:?}: {e}"),
638        })?;
639        builder = builder.proxy(proxy);
640    }
641    builder.build().map_err(|e| Error::HttpSetup {
642        message: e.to_string(),
643    })
644}
645
/// Default ceiling on browser-backed probes per scan, used when
/// [`ClientBuilder::browser_budget`] is not called.
///
/// Sized as ~5× the typical `bot-protected` registry subset — comfortable
/// headroom while still being a guardrail against a misconfigured flag
/// burning a whole Browserbase quota.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
653
654impl fmt::Debug for Client {
655    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
656        f.debug_struct("Client")
657            .field("throttle", &self.throttle)
658            .field("global_throttle", &self.global_throttle)
659            .field("retry", &self.retry)
660            .field("user_agents", &self.user_agents)
661            .field("enrich", &self.enrich)
662            .field("robots", &self.robots.is_some())
663            .field("browser", &self.browser.is_some())
664            .field("browser_budget", &self.browser_budget)
665            .finish_non_exhaustive()
666    }
667}
668
669impl fmt::Debug for ClientBuilder {
670    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
671        f.debug_struct("ClientBuilder")
672            .field("timeout", &self.timeout)
673            .field("connect_timeout", &self.connect_timeout)
674            .field("user_agent", &self.user_agent)
675            .field("follow_redirects", &self.follow_redirects)
676            .field("redirect_limit", &self.redirect_limit)
677            .field("min_request_interval", &self.min_request_interval)
678            .field("max_rps", &self.max_rps)
679            .field("retry", &self.retry)
680            .field("proxy", &self.proxy)
681            .field("user_agents", &self.user_agents)
682            .field("enrich", &self.enrich)
683            .field("respect_robots", &self.respect_robots)
684            .field("browser", &self.browser.is_some())
685            .field("browser_budget", &self.browser_budget)
686            .field("egress", &self.egress)
687            .finish()
688    }
689}
690
/// Site tag that marks a site as bot-protected. Compared against
/// `Site::tags` case-insensitively when deciding whether a probe should go
/// through the browser backend.
const BOT_PROTECTED_TAG: &str = "bot-protected";
692
693fn default_user_agent() -> String {
694    format!("adler/{}", env!("CARGO_PKG_VERSION"))
695}
696
697fn host_of(url: &str) -> String {
698    reqwest::Url::parse(url)
699        .ok()
700        .and_then(|u| u.host_str().map(str::to_owned))
701        .unwrap_or_else(|| "unknown".into())
702}
703
704/// Split a URL into its origin (`scheme://host[:port]`) and path-with-query,
705/// for `robots.txt` lookup. `None` if the URL won't parse or lacks a host.
706fn origin_and_path(url: &str) -> Option<(String, String)> {
707    let parsed = reqwest::Url::parse(url).ok()?;
708    let host = parsed.host_str()?;
709    let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
710    let origin = format!("{}://{host}{port}", parsed.scheme());
711    let path = parsed.query().map_or_else(
712        || parsed.path().to_owned(),
713        |q| format!("{}?{q}", parsed.path()),
714    );
715    Some((origin, path))
716}
717
718fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
719    CheckOutcome {
720        site: site.to_owned(),
721        url,
722        kind,
723        reason: None,
724        elapsed_ms: elapsed_ms(started),
725        enrichment: std::collections::BTreeMap::new(),
726        evidence: Vec::new(),
727    }
728}
729
730fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
731    CheckOutcome {
732        site: site.to_owned(),
733        url,
734        kind: MatchKind::Uncertain,
735        reason: Some(reason),
736        elapsed_ms: elapsed_ms(started),
737        enrichment: std::collections::BTreeMap::new(),
738        evidence: Vec::new(),
739    }
740}
741
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// if the value does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis = started.elapsed().as_millis();
    match u64::try_from(millis) {
        Ok(ms) => ms,
        Err(_) => u64::MAX,
    }
}
745
746#[cfg(test)]
747mod tests {
748    use super::*;
749    use crate::browser::RenderedPage;
750    use crate::site::{Signal, UrlTemplate};
751    use wiremock::matchers::{any, method, path};
752    use wiremock::{Mock, MockServer, ResponseTemplate};
753
    /// Client tuned for tests: 2 s timeout, no per-host throttling, and
    /// retries disabled.
    fn build_client() -> Client {
        Client::builder()
            .timeout(Duration::from_secs(2))
            // Tests share `127.0.0.1` as host — keep throttle out of the
            // way for everything but the dedicated throttle test below.
            .min_request_interval(Duration::ZERO)
            // Default retry would re-hit ban-test mocks; tests opt in
            // explicitly when they want to exercise the retry path.
            .max_retries(0)
            .build()
            .expect("client builds")
    }
766
767    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
768        Site {
769            name: "Mock".into(),
770            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
771            signals,
772            known_present: None,
773            known_absent: None,
774            extract: Vec::new(),
775            tags: Vec::new(),
776            request_headers: std::collections::BTreeMap::new(),
777            regex_check: None,
778            engine: None,
779            strip_bad_char: None,
780            request_method: crate::site::HttpMethod::Get,
781            request_body: None,
782            protection: Vec::new(),
783            disabled: false,
784            source: None,
785            popularity: None,
786            access: crate::AccessPolicy::default(),
787        }
788    }
789
790    fn user() -> Username {
791        Username::new("alice").unwrap()
792    }
793
794    #[tokio::test]
795    async fn regex_check_short_circuits_before_any_request() {
796        // Stand up a mock that would 200 on *anything* — if probe_once
797        // failed to short-circuit on regex mismatch, the username
798        // "alice" (5 chars) would resolve to Found here.
799        let server = MockServer::start().await;
800        Mock::given(any())
801            .respond_with(ResponseTemplate::new(200))
802            .mount(&server)
803            .await;
804        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
805        // The site only accepts usernames of 8+ chars; "alice" is 5.
806        site.regex_check = Some("^[A-Za-z]{8,}$".into());
807        let outcome = build_client().check(&site, &user()).await;
808        assert_eq!(outcome.kind, MatchKind::Uncertain);
809        assert!(
810            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
811            "expected UsernameNotAllowed, got {:?}",
812            outcome.reason,
813        );
814        // No request should have hit the mock — assert by counting
815        // received_requests on the wiremock server.
816        let recvd = server.received_requests().await.unwrap_or_default();
817        assert_eq!(
818            recvd.len(),
819            0,
820            "regex_check mismatch must skip the HTTP request entirely"
821        );
822    }
823
824    #[tokio::test]
825    async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
826        // A mock that would 200 on anything — if the geo gate failed to
827        // short-circuit, "alice" would resolve to Found here.
828        let server = MockServer::start().await;
829        Mock::given(any())
830            .respond_with(ResponseTemplate::new(200))
831            .mount(&server)
832            .await;
833        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
834        // Require a Polish egress; the default client has no egress pool,
835        // so nothing can satisfy it.
836        site.access = crate::access::AccessPolicy {
837            geo: vec![crate::access::CountryCode::new("pl").unwrap()],
838            ip_type: None,
839        };
840        let outcome = build_client().check(&site, &user()).await;
841        assert_eq!(outcome.kind, MatchKind::Uncertain);
842        assert!(
843            matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
844            "expected GeoUnavailable, got {:?}",
845            outcome.reason,
846        );
847        // The site must NOT have been probed — an unreachable geo is not
848        // evidence of absence, and we don't fetch from the wrong location.
849        let recvd = server.received_requests().await.unwrap_or_default();
850        assert_eq!(
851            recvd.len(),
852            0,
853            "geo-unavailable must skip the HTTP request entirely"
854        );
855    }
856
857    #[tokio::test]
858    async fn regex_check_pass_proceeds_to_probe() {
859        let server = MockServer::start().await;
860        Mock::given(any())
861            .and(path("/alice"))
862            .respond_with(ResponseTemplate::new(200))
863            .mount(&server)
864            .await;
865        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
866        // Pattern that matches "alice".
867        site.regex_check = Some("^[a-z]{3,}$".into());
868        let outcome = build_client().check(&site, &user()).await;
869        assert_eq!(outcome.kind, MatchKind::Found);
870    }
871
872    #[tokio::test]
873    async fn status_signal_reports_found_on_match() {
874        let server = MockServer::start().await;
875        Mock::given(any())
876            .and(path("/alice"))
877            .respond_with(ResponseTemplate::new(200))
878            .mount(&server)
879            .await;
880        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
881        let outcome = build_client().check(&site, &user()).await;
882        assert_eq!(outcome.kind, MatchKind::Found);
883        assert!(outcome.url.ends_with("/alice"));
884        assert!(outcome.reason.is_none());
885        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
886    }
887
888    #[tokio::test]
889    async fn status_signal_pair_reports_not_found_on_404() {
890        let server = MockServer::start().await;
891        Mock::given(any())
892            .and(path("/alice"))
893            .respond_with(ResponseTemplate::new(404))
894            .mount(&server)
895            .await;
896        let site = site_with(
897            &server,
898            vec![
899                Signal::StatusFound { codes: vec![200] },
900                Signal::StatusNotFound { codes: vec![404] },
901            ],
902        );
903        let outcome = build_client().check(&site, &user()).await;
904        assert_eq!(outcome.kind, MatchKind::NotFound);
905        // Only the NotFound-voting signal is cited as evidence.
906        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
907    }
908
909    #[tokio::test]
910    async fn body_absent_signal_detects_missing_account() {
911        let server = MockServer::start().await;
912        Mock::given(any())
913            .and(path("/alice"))
914            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
915            .mount(&server)
916            .await;
917        let site = site_with(
918            &server,
919            vec![Signal::BodyAbsent {
920                text: "Profile not found".into(),
921            }],
922        );
923        let outcome = build_client().check(&site, &user()).await;
924        assert_eq!(outcome.kind, MatchKind::NotFound);
925    }
926
927    #[tokio::test]
928    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
929        // Phase 2 semantics: absence of an absence-marker is not evidence
930        // of presence — it just means we have no signal that fired.
931        let server = MockServer::start().await;
932        Mock::given(any())
933            .and(path("/alice"))
934            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
935            .mount(&server)
936            .await;
937        let site = site_with(
938            &server,
939            vec![Signal::BodyAbsent {
940                text: "Profile not found".into(),
941            }],
942        );
943        let outcome = build_client().check(&site, &user()).await;
944        assert_eq!(outcome.kind, MatchKind::Uncertain);
945    }
946
947    #[tokio::test]
948    async fn body_present_plus_absent_resolve_to_found() {
949        let server = MockServer::start().await;
950        Mock::given(any())
951            .and(path("/alice"))
952            .respond_with(
953                ResponseTemplate::new(200)
954                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
955            )
956            .mount(&server)
957            .await;
958        let site = site_with(
959            &server,
960            vec![
961                Signal::BodyPresent {
962                    text: "profile-card".into(),
963                },
964                Signal::BodyAbsent {
965                    text: "Profile not found".into(),
966                },
967            ],
968        );
969        let outcome = build_client().check(&site, &user()).await;
970        assert_eq!(outcome.kind, MatchKind::Found);
971    }
972
973    #[tokio::test]
974    async fn redirect_absent_signal_detects_missing_account() {
975        let server = MockServer::start().await;
976        Mock::given(any())
977            .and(path("/alice"))
978            .respond_with(
979                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
980            )
981            .mount(&server)
982            .await;
983        Mock::given(any())
984            .and(path("/login"))
985            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
986            .mount(&server)
987            .await;
988        let site = site_with(
989            &server,
990            vec![Signal::RedirectAbsent {
991                fragment: "/login".into(),
992            }],
993        );
994        let outcome = build_client().check(&site, &user()).await;
995        assert_eq!(outcome.kind, MatchKind::NotFound);
996    }
997
998    #[tokio::test]
999    async fn negative_signal_wins_over_positive() {
1000        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
1001        // (error marker appears). Negative-priority aggregation → NotFound.
1002        // This is the canonical Sherlock "message" pattern: a site that
1003        // returns 200 for everyone and differentiates via an error string.
1004        let server = MockServer::start().await;
1005        Mock::given(any())
1006            .and(path("/alice"))
1007            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1008            .mount(&server)
1009            .await;
1010        let site = site_with(
1011            &server,
1012            vec![
1013                Signal::StatusFound { codes: vec![200] },
1014                Signal::BodyAbsent {
1015                    text: "Profile not found".into(),
1016                },
1017            ],
1018        );
1019        let outcome = build_client().check(&site, &user()).await;
1020        assert_eq!(outcome.kind, MatchKind::NotFound);
1021    }
1022
1023    #[tokio::test]
1024    async fn network_failure_yields_uncertain() {
1025        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1026        let port = listener.local_addr().unwrap().port();
1027        drop(listener);
1028
1029        let site = Site {
1030            name: "Dead".into(),
1031            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1032            signals: vec![Signal::StatusFound { codes: vec![200] }],
1033            known_present: None,
1034            known_absent: None,
1035            extract: Vec::new(),
1036            tags: Vec::new(),
1037            request_headers: std::collections::BTreeMap::new(),
1038            regex_check: None,
1039            engine: None,
1040            strip_bad_char: None,
1041            request_method: crate::site::HttpMethod::Get,
1042            request_body: None,
1043            protection: Vec::new(),
1044            disabled: false,
1045            source: None,
1046            popularity: None,
1047            access: crate::AccessPolicy::default(),
1048        };
1049        let client = Client::builder()
1050            .timeout(Duration::from_millis(500))
1051            .connect_timeout(Duration::from_millis(500))
1052            .max_retries(0)
1053            .build()
1054            .unwrap();
1055        let outcome = client.check(&site, &user()).await;
1056        assert_eq!(outcome.kind, MatchKind::Uncertain);
1057        assert!(outcome.reason.is_some());
1058    }
1059
1060    #[tokio::test]
1061    async fn throttle_spaces_consecutive_calls_to_same_host() {
1062        let server = MockServer::start().await;
1063        Mock::given(any())
1064            .and(path("/alice"))
1065            .respond_with(ResponseTemplate::new(200))
1066            .mount(&server)
1067            .await;
1068        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1069        // Interval is intentionally much larger than typical wiremock latency
1070        // (≤10 ms locally, can spike under heavy parallel test load). Any
1071        // value too close to HTTP latency would let the first request burn
1072        // through the throttle window and make the assertion flaky.
1073        let client = Client::builder()
1074            .timeout(Duration::from_secs(2))
1075            .min_request_interval(Duration::from_millis(300))
1076            .build()
1077            .unwrap();
1078
1079        client.check(&site, &user()).await;
1080        let started = Instant::now();
1081        client.check(&site, &user()).await;
1082        let elapsed = started.elapsed();
1083        assert!(
1084            elapsed >= Duration::from_millis(200),
1085            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1086        );
1087    }
1088
1089    #[tokio::test]
1090    async fn builder_overrides_user_agent() {
1091        let server = MockServer::start().await;
1092        Mock::given(any())
1093            .and(path("/alice"))
1094            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1095            .respond_with(ResponseTemplate::new(200))
1096            .mount(&server)
1097            .await;
1098        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1099        let client = Client::builder()
1100            .user_agent("adler-test/1.0")
1101            .build()
1102            .unwrap();
1103        let outcome = client.check(&site, &user()).await;
1104        assert_eq!(outcome.kind, MatchKind::Found);
1105    }
1106
1107    #[tokio::test]
1108    async fn rate_limit_429_yields_uncertain_with_note() {
1109        let server = MockServer::start().await;
1110        Mock::given(any())
1111            .and(path("/alice"))
1112            .respond_with(ResponseTemplate::new(429))
1113            .mount(&server)
1114            .await;
1115        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1116        let outcome = build_client().check(&site, &user()).await;
1117        assert_eq!(outcome.kind, MatchKind::Uncertain);
1118        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1119    }
1120
1121    #[tokio::test]
1122    async fn cloudflare_server_header_yields_uncertain() {
1123        let server = MockServer::start().await;
1124        Mock::given(any())
1125            .and(path("/alice"))
1126            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1127            .mount(&server)
1128            .await;
1129        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1130        let outcome = build_client().check(&site, &user()).await;
1131        assert_eq!(outcome.kind, MatchKind::Uncertain);
1132        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1133    }
1134
1135    #[tokio::test]
1136    async fn cloudflare_interstitial_in_body_yields_uncertain() {
1137        // Body-based ban detection only runs when a signal already needs
1138        // the body — this site uses BodyAbsent so the body is read.
1139        let server = MockServer::start().await;
1140        Mock::given(any())
1141            .and(path("/alice"))
1142            .respond_with(
1143                ResponseTemplate::new(200)
1144                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1145            )
1146            .mount(&server)
1147            .await;
1148        let site = site_with(
1149            &server,
1150            vec![Signal::BodyAbsent {
1151                text: "Profile not found".into(),
1152            }],
1153        );
1154        let outcome = build_client().check(&site, &user()).await;
1155        assert_eq!(outcome.kind, MatchKind::Uncertain);
1156        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1157    }
1158
1159    #[tokio::test]
1160    async fn ban_detection_does_not_fire_on_legitimate_403() {
1161        let server = MockServer::start().await;
1162        Mock::given(any())
1163            .and(path("/alice"))
1164            .respond_with(ResponseTemplate::new(403))
1165            .mount(&server)
1166            .await;
1167        let site = site_with(
1168            &server,
1169            vec![
1170                Signal::StatusFound { codes: vec![200] },
1171                Signal::StatusNotFound { codes: vec![403] },
1172            ],
1173        );
1174        let outcome = build_client().check(&site, &user()).await;
1175        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
1176        assert_eq!(outcome.kind, MatchKind::NotFound);
1177        assert!(outcome.reason.is_none());
1178    }
1179
1180    #[tokio::test]
1181    async fn retry_recovers_after_transient_429() {
1182        let server = MockServer::start().await;
1183        // First request: 429. Subsequent: 200.
1184        Mock::given(any())
1185            .and(path("/alice"))
1186            .respond_with(ResponseTemplate::new(429))
1187            .up_to_n_times(1)
1188            .mount(&server)
1189            .await;
1190        Mock::given(any())
1191            .and(path("/alice"))
1192            .respond_with(ResponseTemplate::new(200))
1193            .mount(&server)
1194            .await;
1195        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1196        let client = Client::builder()
1197            .timeout(Duration::from_secs(2))
1198            .min_request_interval(Duration::ZERO)
1199            .max_retries(2)
1200            .base_backoff_delay(Duration::from_millis(20))
1201            .max_backoff_delay(Duration::from_millis(100))
1202            .build()
1203            .unwrap();
1204        let outcome = client.check(&site, &user()).await;
1205        assert_eq!(outcome.kind, MatchKind::Found);
1206        assert!(outcome.reason.is_none());
1207    }
1208
1209    #[tokio::test]
1210    async fn retry_exhausts_and_returns_uncertain() {
1211        let server = MockServer::start().await;
1212        Mock::given(any())
1213            .and(path("/alice"))
1214            .respond_with(ResponseTemplate::new(429))
1215            .mount(&server)
1216            .await;
1217        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1218        let client = Client::builder()
1219            .timeout(Duration::from_secs(2))
1220            .min_request_interval(Duration::ZERO)
1221            .max_retries(2)
1222            .base_backoff_delay(Duration::from_millis(10))
1223            .max_backoff_delay(Duration::from_millis(50))
1224            .build()
1225            .unwrap();
1226        let outcome = client.check(&site, &user()).await;
1227        assert_eq!(outcome.kind, MatchKind::Uncertain);
1228        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1229    }
1230
    #[tokio::test]
    async fn retry_does_not_fire_on_network_error() {
        // Connection refused → Uncertain with a `Network` reason, not a
        // ban marker. We must NOT retry — otherwise a single dead site
        // burns the full backoff budget before reporting.
        //
        // Bind an ephemeral port and drop the listener right away; the test
        // relies on nothing else listening on that port afterwards.
        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
        let port = listener.local_addr().unwrap().port();
        drop(listener);
        let site = Site {
            name: "Dead".into(),
            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
            signals: vec![Signal::StatusFound { codes: vec![200] }],
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            source: None,
            popularity: None,
            access: crate::AccessPolicy::default(),
        };
        // Retries are enabled with a 60 s base backoff, so a retry that
        // fired would blow well past the 5 s bound asserted below.
        let client = Client::builder()
            .timeout(Duration::from_millis(500))
            .connect_timeout(Duration::from_millis(500))
            .min_request_interval(Duration::ZERO)
            .max_retries(3)
            .base_backoff_delay(Duration::from_secs(60))
            .build()
            .unwrap();
        let started = Instant::now();
        let outcome = client.check(&site, &user()).await;
        // If retry fired, we'd be sleeping minutes; instead this returns
        // promptly with an Uncertain.
        assert!(started.elapsed() < Duration::from_secs(5));
        assert_eq!(outcome.kind, MatchKind::Uncertain);
        assert!(
            matches!(outcome.reason, Some(UncertainReason::Network(_))),
            "got {:?}",
            outcome.reason,
        );
    }
1279
    #[tokio::test]
    async fn rotates_user_agent_per_request() {
        // The mock only matches when the request carries the single pooled
        // UA configured below; if rotation weren't applied, the request
        // would miss the mock and the probe would not resolve to Found.
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        let client = Client::builder()
            .min_request_interval(Duration::ZERO)
            .max_retries(0)
            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
            .build()
            .unwrap();
        let outcome = client.check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Found);
    }
1302
1303    #[test]
1304    fn invalid_proxy_url_fails_build() {
1305        let err = Client::builder().proxy("not a url").build().unwrap_err();
1306        assert!(matches!(err, Error::HttpSetup { .. }));
1307    }
1308
1309    #[test]
1310    fn schemeless_proxy_is_rejected_up_front() {
1311        // reqwest would silently treat this as a host; we require a scheme.
1312        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1313        let Error::HttpSetup { message } = err else {
1314            panic!("expected HttpSetup, got {err:?}");
1315        };
1316        assert!(message.contains("must start with"), "{message}");
1317    }
1318
1319    #[test]
1320    fn socks5_proxy_scheme_is_accepted() {
1321        // Valid scheme + endpoint builds fine (no connection is attempted).
1322        assert!(
1323            Client::builder()
1324                .proxy("socks5://127.0.0.1:9050")
1325                .build()
1326                .is_ok()
1327        );
1328    }
1329
1330    #[tokio::test]
1331    async fn global_rps_cap_spaces_requests_across_hosts() {
1332        // Two distinct host paths; per-host throttle is disabled, so any
1333        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1334        let server = MockServer::start().await;
1335        Mock::given(any())
1336            .respond_with(ResponseTemplate::new(200))
1337            .mount(&server)
1338            .await;
1339        let site_a = Site {
1340            name: "A".into(),
1341            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1342            signals: vec![Signal::StatusFound { codes: vec![200] }],
1343            known_present: None,
1344            known_absent: None,
1345            extract: Vec::new(),
1346            tags: Vec::new(),
1347            request_headers: std::collections::BTreeMap::new(),
1348            regex_check: None,
1349            engine: None,
1350            strip_bad_char: None,
1351            request_method: crate::site::HttpMethod::Get,
1352            request_body: None,
1353            protection: Vec::new(),
1354            disabled: false,
1355            source: None,
1356            popularity: None,
1357            access: crate::AccessPolicy::default(),
1358        };
1359        let site_b = Site {
1360            name: "B".into(),
1361            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1362            signals: vec![Signal::StatusFound { codes: vec![200] }],
1363            known_present: None,
1364            known_absent: None,
1365            extract: Vec::new(),
1366            tags: Vec::new(),
1367            request_headers: std::collections::BTreeMap::new(),
1368            regex_check: None,
1369            engine: None,
1370            strip_bad_char: None,
1371            request_method: crate::site::HttpMethod::Get,
1372            request_body: None,
1373            protection: Vec::new(),
1374            disabled: false,
1375            source: None,
1376            popularity: None,
1377            access: crate::AccessPolicy::default(),
1378        };
1379        // 2 RPS → ~500 ms between requests. A large interval keeps the
1380        // assertion robust even when the first probe's own duration (which
1381        // eats into the measured gap) is inflated by test instrumentation
1382        // such as coverage tooling.
1383        let client = Client::builder()
1384            .min_request_interval(Duration::ZERO)
1385            .max_retries(0)
1386            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1387            .build()
1388            .unwrap();
1389        // First request consumes the slot at t≈0; second waits ~500 ms even
1390        // though it targets a different host.
1391        client.check(&site_a, &user()).await;
1392        let started = Instant::now();
1393        client.check(&site_b, &user()).await;
1394        assert!(
1395            started.elapsed() >= Duration::from_millis(350),
1396            "global cap should space cross-host requests, got {:?}",
1397            started.elapsed(),
1398        );
1399    }
1400
1401    #[tokio::test]
1402    async fn respect_robots_skips_disallowed_paths() {
1403        let server = MockServer::start().await;
1404        Mock::given(any())
1405            .and(path("/robots.txt"))
1406            .respond_with(
1407                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1408            )
1409            .mount(&server)
1410            .await;
1411        Mock::given(any())
1412            .and(path("/no/alice"))
1413            .respond_with(ResponseTemplate::new(200))
1414            .mount(&server)
1415            .await;
1416        Mock::given(any())
1417            .and(path("/yes/alice"))
1418            .respond_with(ResponseTemplate::new(200))
1419            .mount(&server)
1420            .await;
1421        let client = Client::builder()
1422            .min_request_interval(Duration::ZERO)
1423            .max_retries(0)
1424            .respect_robots(true)
1425            .build()
1426            .unwrap();
1427
1428        let disallowed = Site {
1429            name: "No".into(),
1430            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1431            signals: vec![Signal::StatusFound { codes: vec![200] }],
1432            known_present: None,
1433            known_absent: None,
1434            extract: Vec::new(),
1435            tags: Vec::new(),
1436            request_headers: std::collections::BTreeMap::new(),
1437            regex_check: None,
1438            engine: None,
1439            strip_bad_char: None,
1440            request_method: crate::site::HttpMethod::Get,
1441            request_body: None,
1442            protection: Vec::new(),
1443            disabled: false,
1444            source: None,
1445            popularity: None,
1446            access: crate::AccessPolicy::default(),
1447        };
1448        let allowed = Site {
1449            name: "Yes".into(),
1450            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1451            signals: vec![Signal::StatusFound { codes: vec![200] }],
1452            known_present: None,
1453            known_absent: None,
1454            extract: Vec::new(),
1455            tags: Vec::new(),
1456            request_headers: std::collections::BTreeMap::new(),
1457            regex_check: None,
1458            engine: None,
1459            strip_bad_char: None,
1460            request_method: crate::site::HttpMethod::Get,
1461            request_body: None,
1462            protection: Vec::new(),
1463            disabled: false,
1464            source: None,
1465            popularity: None,
1466            access: crate::AccessPolicy::default(),
1467        };
1468
1469        let no = client.check(&disallowed, &user()).await;
1470        assert_eq!(no.kind, MatchKind::Uncertain);
1471        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1472
1473        let yes = client.check(&allowed, &user()).await;
1474        assert_eq!(yes.kind, MatchKind::Found);
1475    }
1476
1477    #[tokio::test]
1478    async fn body_read_skipped_when_no_body_signal_needed() {
1479        // Mock returns body that would fail a body_absent check — but since
1480        // we only have a status signal, body is never read.
1481        let server = MockServer::start().await;
1482        Mock::given(any())
1483            .and(path("/alice"))
1484            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1485            .mount(&server)
1486            .await;
1487        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1488        let outcome = build_client().check(&site, &user()).await;
1489        assert_eq!(outcome.kind, MatchKind::Found);
1490    }
1491
1492    // ===== Browser routing =====
1493
1494    /// Test backend that returns a canned page and counts calls. Lets the
1495    /// routing tests assert "Client did/did not invoke the browser" without
1496    /// involving a real Chrome process.
1497    #[derive(Debug)]
1498    struct RecordingBackend {
1499        page: RenderedPage,
1500        calls: std::sync::atomic::AtomicUsize,
1501    }
1502
1503    impl RecordingBackend {
1504        fn with_page(page: RenderedPage) -> Self {
1505            Self {
1506                page,
1507                calls: std::sync::atomic::AtomicUsize::new(0),
1508            }
1509        }
1510        fn call_count(&self) -> usize {
1511            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1512        }
1513    }
1514
1515    #[async_trait::async_trait]
1516    impl BrowserBackend for RecordingBackend {
1517        async fn fetch(
1518            &self,
1519            _url: &url::Url,
1520            _headers: &std::collections::BTreeMap<String, String>,
1521            _timeout: Duration,
1522        ) -> Result<RenderedPage> {
1523            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1524            Ok(self.page.clone())
1525        }
1526    }
1527
1528    fn site_bot_protected(server: &MockServer) -> Site {
1529        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1530        s.tags = vec!["bot-protected".into()];
1531        s
1532    }
1533
1534    #[tokio::test]
1535    async fn browser_routes_bot_protected_sites() {
1536        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1537        // returns its canned page directly.
1538        let server = MockServer::start().await;
1539        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1540            status: 200,
1541            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1542            body: "<html></html>".into(),
1543            elapsed_ms: 42,
1544        }));
1545        let client = Client::builder()
1546            .min_request_interval(Duration::ZERO)
1547            .max_retries(0)
1548            .browser(backend.clone())
1549            .build()
1550            .unwrap();
1551        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1552        assert_eq!(outcome.kind, MatchKind::Found);
1553        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1554    }
1555
1556    #[tokio::test]
1557    async fn non_bot_protected_sites_skip_browser() {
1558        let server = MockServer::start().await;
1559        Mock::given(any())
1560            .and(path("/alice"))
1561            .respond_with(ResponseTemplate::new(200))
1562            .mount(&server)
1563            .await;
1564        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1565            status: 500, // would make wiremock case fail if browser was taken
1566            final_url: url::Url::parse("https://x/").unwrap(),
1567            body: String::new(),
1568            elapsed_ms: 0,
1569        }));
1570        let client = Client::builder()
1571            .min_request_interval(Duration::ZERO)
1572            .max_retries(0)
1573            .browser(backend.clone())
1574            .build()
1575            .unwrap();
1576        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1577        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1578        let outcome = client.check(&site, &user()).await;
1579        assert_eq!(outcome.kind, MatchKind::Found);
1580        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1581    }
1582
1583    #[tokio::test]
1584    async fn browser_budget_exhaust_yields_uncertain() {
1585        let server = MockServer::start().await;
1586        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1587            status: 200,
1588            final_url: url::Url::parse("https://x/").unwrap(),
1589            body: String::new(),
1590            elapsed_ms: 0,
1591        }));
1592        let client = Client::builder()
1593            .min_request_interval(Duration::ZERO)
1594            .max_retries(0)
1595            .browser(backend.clone())
1596            .browser_budget(1)
1597            .build()
1598            .unwrap();
1599        let site = site_bot_protected(&server);
1600        // First call consumes the only slot.
1601        let first = client.check(&site, &user()).await;
1602        assert_eq!(first.kind, MatchKind::Found);
1603        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1604        let second = client.check(&site, &user()).await;
1605        assert_eq!(second.kind, MatchKind::Uncertain);
1606        assert!(matches!(
1607            second.reason,
1608            Some(UncertainReason::BrowserBudget)
1609        ));
1610        assert_eq!(
1611            backend.call_count(),
1612            1,
1613            "second call must not invoke backend"
1614        );
1615    }
1616
1617    #[tokio::test]
1618    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1619        struct FailingBackend;
1620        #[async_trait::async_trait]
1621        impl BrowserBackend for FailingBackend {
1622            async fn fetch(
1623                &self,
1624                _url: &url::Url,
1625                _headers: &std::collections::BTreeMap<String, String>,
1626                _timeout: Duration,
1627            ) -> Result<RenderedPage> {
1628                Err(Error::BrowserSetup {
1629                    message: "simulated crash".into(),
1630                })
1631            }
1632        }
1633        impl std::fmt::Debug for FailingBackend {
1634            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1635                f.write_str("FailingBackend")
1636            }
1637        }
1638
1639        let server = MockServer::start().await;
1640        let client = Client::builder()
1641            .min_request_interval(Duration::ZERO)
1642            .max_retries(0)
1643            .browser(Arc::new(FailingBackend))
1644            .build()
1645            .unwrap();
1646        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1647        assert_eq!(outcome.kind, MatchKind::Uncertain);
1648        match outcome.reason {
1649            Some(UncertainReason::BrowserFailed(msg)) => {
1650                assert!(msg.contains("simulated crash"), "got: {msg}");
1651            }
1652            other => panic!("expected BrowserFailed, got {other:?}"),
1653        }
1654    }
1655
1656    #[tokio::test]
1657    async fn status_only_site_uses_head_request() {
1658        // Site with only status signals (no body markers, no enrichment)
1659        // should be probed with HEAD — saves the body download on
1660        // ~30% of the registry.
1661        let server = MockServer::start().await;
1662        Mock::given(method("HEAD"))
1663            .and(path("/alice"))
1664            .respond_with(ResponseTemplate::new(200))
1665            .mount(&server)
1666            .await;
1667        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1668        let outcome = build_client().check(&site, &user()).await;
1669        assert_eq!(outcome.kind, MatchKind::Found);
1670        let recvd = server.received_requests().await.unwrap_or_default();
1671        assert_eq!(recvd.len(), 1);
1672        assert_eq!(recvd[0].method.as_str(), "HEAD");
1673    }
1674
1675    #[tokio::test]
1676    async fn body_signal_site_uses_get_request() {
1677        // Same baseline plus a body-marker signal — must still GET so
1678        // the body actually arrives for matching.
1679        let server = MockServer::start().await;
1680        Mock::given(any())
1681            .and(path("/alice"))
1682            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
1683            .mount(&server)
1684            .await;
1685        let site = site_with(
1686            &server,
1687            vec![Signal::BodyPresent {
1688                text: "hello".into(),
1689            }],
1690        );
1691        let outcome = build_client().check(&site, &user()).await;
1692        assert_eq!(outcome.kind, MatchKind::Found);
1693        let recvd = server.received_requests().await.unwrap_or_default();
1694        assert_eq!(recvd[0].method.as_str(), "GET");
1695    }
1696
1697    #[tokio::test]
1698    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
1699        // A site that declares `protection: [Cloudflare]` but doesn't
1700        // carry the legacy `bot-protected` tag should still route
1701        // through the browser backend — the new structured field is
1702        // an additional signal, not a tag replacement.
1703        let server = MockServer::start().await;
1704        Mock::given(any())
1705            .respond_with(ResponseTemplate::new(200))
1706            .mount(&server)
1707            .await;
1708        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1709        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
1710        // No bot-protected tag — pure structured-field test.
1711        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1712            status: 200,
1713            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1714            body: String::new(),
1715            elapsed_ms: 0,
1716        }));
1717        let client = Client::builder()
1718            .min_request_interval(Duration::ZERO)
1719            .max_retries(0)
1720            .browser(backend)
1721            .build()
1722            .unwrap();
1723        let outcome = client.check(&site, &user()).await;
1724        // The recording backend always returns a synthetic 200, so
1725        // Found means we went through the browser path.
1726        assert_eq!(outcome.kind, MatchKind::Found);
1727        // No raw HTTP probe should have hit the mock server.
1728        let recvd = server.received_requests().await.unwrap_or_default();
1729        assert_eq!(
1730            recvd.len(),
1731            0,
1732            "structured protection must skip the raw HTTP path"
1733        );
1734    }
1735
1736    #[tokio::test]
1737    async fn post_method_sends_body_with_username_substituted() {
1738        // A POST-probed site (e.g. Anilist GraphQL) — the username
1739        // goes in the body, not the URL. Adler should substitute
1740        // `{username}` and send a POST with the rendered payload.
1741        let server = MockServer::start().await;
1742        Mock::given(method("POST"))
1743            .and(path("/api"))
1744            .respond_with(ResponseTemplate::new(200))
1745            .mount(&server)
1746            .await;
1747        // URL substitution still requires the `{username}` placeholder,
1748        // even for POST sites where the username also lives in the
1749        // body. Most real POST endpoints encode the username in both
1750        // (e.g. query string + body); we mirror that.
1751        let site = Site {
1752            name: "ApiPost".into(),
1753            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
1754            signals: vec![Signal::StatusFound { codes: vec![200] }],
1755            known_present: None,
1756            known_absent: None,
1757            extract: Vec::new(),
1758            tags: Vec::new(),
1759            request_headers: std::collections::BTreeMap::new(),
1760            regex_check: None,
1761            engine: None,
1762            strip_bad_char: None,
1763            request_method: HttpMethod::Post,
1764            request_body: Some(r#"{"name":"{username}"}"#.into()),
1765            protection: Vec::new(),
1766            disabled: false,
1767            source: None,
1768            popularity: None,
1769            access: crate::AccessPolicy::default(),
1770        };
1771        let outcome = build_client().check(&site, &user()).await;
1772        assert_eq!(outcome.kind, MatchKind::Found);
1773        let recvd = server.received_requests().await.unwrap_or_default();
1774        assert_eq!(recvd.len(), 1);
1775        assert_eq!(recvd[0].method.as_str(), "POST");
1776        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
1777        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
1778    }
1779
1780    #[tokio::test]
1781    async fn head_405_falls_back_to_get() {
1782        // A server that rejects HEAD with 405 — Adler should silently
1783        // retry with GET so the optimisation can never cost accuracy.
1784        let server = MockServer::start().await;
1785        Mock::given(method("HEAD"))
1786            .and(path("/alice"))
1787            .respond_with(ResponseTemplate::new(405))
1788            .mount(&server)
1789            .await;
1790        Mock::given(any())
1791            .and(path("/alice"))
1792            .respond_with(ResponseTemplate::new(200))
1793            .mount(&server)
1794            .await;
1795        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1796        let outcome = build_client().check(&site, &user()).await;
1797        assert_eq!(outcome.kind, MatchKind::Found);
1798        let recvd = server.received_requests().await.unwrap_or_default();
1799        assert_eq!(recvd.len(), 2);
1800        assert_eq!(recvd[0].method.as_str(), "HEAD");
1801        assert_eq!(recvd[1].method.as_str(), "GET");
1802    }
1803}