Skip to main content

adler_core/client/
mod.rs

1//! HTTP client wrapping `reqwest`, plus the per-site probe entry point.
2//!
3//! The wrapper exists to keep `reqwest` out of Adler's public API surface.
4//! All knobs that future modules need (timeouts, redirect policy, user agent)
5//! are configured through [`ClientBuilder`]; per-request transient failures
6//! never bubble up as errors — they become
7//! [`MatchKind::Uncertain`](crate::MatchKind::Uncertain) on the returned
8//! outcome.
9
10use std::fmt;
11use std::sync::Arc;
12use std::time::Duration;
13
14use crate::access::{EgressPool, SessionStore};
15use crate::browser::{BrowserBackend, BrowserBudget};
16use crate::retry::RetryPolicy;
17use crate::robots::RobotsCache;
18use crate::throttle::HostThrottle;
19use crate::transport::HttpFetcher;
20#[cfg(feature = "impersonate")]
21use crate::transport::ImpersonateFetcher;
22
// NOTE(review): none of the defaults below are referenced in the visible part
// of this file; presumably the `builder` submodule reads them — confirm.

/// Default overall request timeout: 10 seconds.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Default connect timeout: 5 seconds.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Default redirect limit: 8.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default per-host interval: 100 milliseconds.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// The global rate limiter gates every host at once, so one fixed key is
/// all it ever needs.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
29
30/// HTTP client used to probe sites.
31///
32/// Cheap to clone — the underlying `reqwest::Client` is reference-counted
33/// internally, and the throttle is `Arc`-backed, so cloning is the
34/// recommended way to share a client between tasks. Cloned clients share
35/// throttle state, which is what you want: a fan-out scan must not
36/// accidentally exceed a per-host budget by spawning more clients.
37#[derive(Clone)]
38pub struct Client {
39    http: Arc<HttpFetcher>,
40    /// Geo / IP-type egress pool for sites whose `access` policy needs a
41    /// specific proxy. Empty by default → every site uses `http`.
42    egress: Arc<EgressPool>,
43    /// Operator-supplied sessions, keyed by the name a site references
44    /// via `access.session`. Empty by default.
45    sessions: Arc<SessionStore>,
46    throttle: HostThrottle,
47    /// Global RPS cap applied across all hosts. `None` → uncapped.
48    global_throttle: Option<HostThrottle>,
49    retry: RetryPolicy,
50    /// Optional rotation pool. Empty → use the client's fixed User-Agent.
51    /// `Arc<[String]>` so cloning a client per task stays cheap.
52    user_agents: Arc<[String]>,
53    /// Extract profile fields from `Found` pages that declare extractors.
54    enrich: bool,
55    /// When set, skip probes disallowed by the host's `robots.txt`.
56    robots: Option<RobotsCache>,
57    /// Browser backend used for `bot-protected` sites. `None` → those sites
58    /// stay on the raw HTTP path and typically end up `Uncertain`.
59    browser: Option<Arc<dyn BrowserBackend>>,
60    /// TLS-fingerprint-impersonating HTTP client (`wreq`). Built when
61    /// the `impersonate` Cargo feature is on; routes sites whose
62    /// `protection` is exactly `TlsFingerprint`.
63    #[cfg(feature = "impersonate")]
64    impersonate: Option<Arc<ImpersonateFetcher>>,
65    /// Per-scan cap on browser fetches. Shared across `Client::check` calls
66    /// for a single scan, so several tasks compete for the same budget.
67    browser_budget: Arc<BrowserBudget>,
68    /// Per-scan cap on *automatic escalations* from a cheap transport to
69    /// the browser when the cheap path returns
70    /// `Uncertain(CloudflareChallenge | RateLimited)`. Independent of
71    /// `browser_budget` so the pre-tagged `bot-protected` subset and the
72    /// long-tail escalation subset don't fight over the same number.
73    escalation_budget: Arc<crate::escalation::EscalationBudget>,
74    /// Whether automatic escalation runs at all. `false` keeps the cheap
75    /// transport's outcome verbatim — useful for benchmarking the raw
76    /// signals without the access-engine lift on top.
77    escalation_enabled: bool,
78}
79
80impl Client {
81    /// Start configuring a new client.
82    pub fn builder() -> ClientBuilder {
83        ClientBuilder::default()
84    }
85
86    /// Read-only view of the configured egress pool — `(country, kind)`
87    /// for every registered proxy, in the order they were declared.
88    /// Proxy URLs are not surfaced (they typically carry credentials),
89    /// so this is safe to serialise to a JSON response.
90    #[must_use]
91    pub fn egress_summary(&self) -> Vec<crate::access::EgressSummary> {
92        self.egress.summary()
93    }
94
95    /// Names of the configured sessions (sorted lexicographically),
96    /// without any header values. Useful for a UI listing which session
97    /// keys an operator can reference via `access.session` on a site.
98    #[must_use]
99    pub fn session_names(&self) -> Vec<String> {
100        self.sessions.names()
101    }
102
103    /// Names of the configured egresses (in registration order, only
104    /// those that supplied a name). Used by the server to validate
105    /// per-scan `egress_names` against the loaded pool.
106    #[must_use]
107    pub fn egress_names(&self) -> Vec<String> {
108        self.egress.names()
109    }
110
111    /// Returns a new client identical to this one except its egress
112    /// pool is restricted to entries whose `name` matches one of
113    /// `names`. An empty `names` slice is treated as "no filter" and
114    /// returns a clone of the full pool.
115    ///
116    /// Cheap to call repeatedly: all shared state (HTTP clients,
117    /// throttle, sessions, budgets, browser backend, …) is
118    /// `Arc`-cloned so the returned client shares the parent's
119    /// per-scan caps (browser budget, escalation budget, throttle
120    /// state) rather than each subset getting a fresh one. This is the
121    /// right behaviour for a single web-server instance handing out
122    /// per-request clients.
123    #[must_use]
124    pub fn with_egress_subset(&self, names: &[String]) -> Self {
125        Self {
126            http: Arc::clone(&self.http),
127            egress: Arc::new(self.egress.subset(names)),
128            sessions: Arc::clone(&self.sessions),
129            throttle: self.throttle.clone(),
130            global_throttle: self.global_throttle.clone(),
131            retry: self.retry.clone(),
132            user_agents: Arc::clone(&self.user_agents),
133            enrich: self.enrich,
134            robots: self.robots.clone(),
135            browser: self.browser.clone(),
136            #[cfg(feature = "impersonate")]
137            impersonate: self.impersonate.clone(),
138            browser_budget: Arc::clone(&self.browser_budget),
139            escalation_budget: Arc::clone(&self.escalation_budget),
140            escalation_enabled: self.escalation_enabled,
141        }
142    }
143}
144
/// Unprocessed response data handed back by [`Client::fetch`] for
/// diagnostic purposes.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// Status code of the HTTP response.
    pub status: u16,
    /// URL the request ended on once redirects were followed.
    pub final_url: String,
    /// Response body, already decoded to text.
    pub body: String,
}
155
156impl fmt::Debug for Client {
157    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158        f.debug_struct("Client")
159            .field("throttle", &self.throttle)
160            .field("global_throttle", &self.global_throttle)
161            .field("retry", &self.retry)
162            .field("user_agents", &self.user_agents)
163            .field("enrich", &self.enrich)
164            .field("robots", &self.robots.is_some())
165            .field("browser", &self.browser.is_some())
166            .field("browser_budget", &self.browser_budget)
167            .field("escalation_budget", &self.escalation_budget)
168            .field("escalation_enabled", &self.escalation_enabled)
169            .finish_non_exhaustive()
170    }
171}
172
/// Tag a registry entry carries when the site sits behind bot protection
/// (Cloudflare, `PerimeterX`, datadome, `hCaptcha`, and similar).
///
/// Routing reads it as a hint that residential egress is probably needed;
/// the doctor and registry-summary surfaces rely on it to annotate
/// honest-limit audits. Tag comparison goes through
/// [`str::eq_ignore_ascii_case`].
pub const BOT_PROTECTED_TAG: &str = "bot-protected";
181
182mod builder;
183mod probe;
184mod util;
185pub use builder::{ClientBuilder, DEFAULT_BROWSER_BUDGET, DEFAULT_ESCALATION_BUDGET};
186
187#[cfg(test)]
188mod tests {
189    use super::*;
190    use crate::browser::RenderedPage;
191    use crate::check::{MatchKind, UncertainReason};
192    use crate::confidence::ConfidenceReason;
193    use crate::error::{Error, Result};
194    use crate::profile::{EvidenceOrigin, ProfileEvidenceKind};
195    use crate::site::{Extractor, HttpMethod, ProtectionKind, Signal, Site, UrlTemplate};
196    use crate::username::Username;
197    use std::time::Instant;
198    use wiremock::matchers::{any, method, path};
199    use wiremock::{Mock, MockServer, ResponseTemplate};
200
201    use crate::test_fixtures::{default_site, test_client};
202
203    fn build_client() -> Client {
204        test_client()
205    }
206
207    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
208        let mut s = default_site("Mock", &format!("{}/{{username}}", server.uri()));
209        s.signals = signals;
210        s
211    }
212
213    fn user() -> Username {
214        Username::new("alice").unwrap()
215    }
216
217    #[tokio::test]
218    async fn regex_check_short_circuits_before_any_request() {
219        // Stand up a mock that would 200 on *anything* — if probe_once
220        // failed to short-circuit on regex mismatch, the username
221        // "alice" (5 chars) would resolve to Found here.
222        let server = MockServer::start().await;
223        Mock::given(any())
224            .respond_with(ResponseTemplate::new(200))
225            .mount(&server)
226            .await;
227        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
228        // The site only accepts usernames of 8+ chars; "alice" is 5.
229        site.regex_check = Some("^[A-Za-z]{8,}$".into());
230        let outcome = build_client().check(&site, &user()).await;
231        assert_eq!(outcome.kind, MatchKind::Uncertain);
232        assert!(
233            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
234            "expected UsernameNotAllowed, got {:?}",
235            outcome.reason,
236        );
237        // No request should have hit the mock — assert by counting
238        // received_requests on the wiremock server.
239        let recvd = server.received_requests().await.unwrap_or_default();
240        assert_eq!(
241            recvd.len(),
242            0,
243            "regex_check mismatch must skip the HTTP request entirely"
244        );
245    }
246
247    #[tokio::test]
248    async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
249        // A mock that would 200 on anything — if the geo gate failed to
250        // short-circuit, "alice" would resolve to Found here.
251        let server = MockServer::start().await;
252        Mock::given(any())
253            .respond_with(ResponseTemplate::new(200))
254            .mount(&server)
255            .await;
256        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
257        // Require a Polish egress; the default client has no egress pool,
258        // so nothing can satisfy it.
259        site.access = crate::access::AccessPolicy {
260            geo: vec![crate::access::CountryCode::new("pl").unwrap()],
261            ..crate::access::AccessPolicy::default()
262        };
263        let outcome = build_client().check(&site, &user()).await;
264        assert_eq!(outcome.kind, MatchKind::Uncertain);
265        assert!(
266            matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
267            "expected GeoUnavailable, got {:?}",
268            outcome.reason,
269        );
270        // The site must NOT have been probed — an unreachable geo is not
271        // evidence of absence, and we don't fetch from the wrong location.
272        let recvd = server.received_requests().await.unwrap_or_default();
273        assert_eq!(
274            recvd.len(),
275            0,
276            "geo-unavailable must skip the HTTP request entirely"
277        );
278    }
279
280    #[tokio::test]
281    async fn session_headers_are_sent_on_probe() {
282        // Only respond 200 when the request carries the session cookie,
283        // so a Found verdict proves the header was actually applied.
284        let server = MockServer::start().await;
285        Mock::given(any())
286            .and(wiremock::matchers::header("cookie", "sessionid=real"))
287            .respond_with(ResponseTemplate::new(200))
288            .mount(&server)
289            .await;
290        let mut headers = std::collections::BTreeMap::new();
291        headers.insert("Cookie".to_string(), "sessionid=real".to_string());
292        let mut store = SessionStore::new();
293        store.insert("acct", crate::access::Session::from_headers(headers));
294        let client = Client::builder()
295            .timeout(Duration::from_secs(2))
296            .min_request_interval(Duration::ZERO)
297            .max_retries(0)
298            .sessions(store)
299            .build()
300            .expect("client builds");
301        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
302        site.access.session = Some("acct".to_string());
303        let outcome = client.check(&site, &user()).await;
304        assert_eq!(
305            outcome.kind,
306            MatchKind::Found,
307            "session cookie should unlock the 200 (got {:?})",
308            outcome.reason,
309        );
310    }
311
312    #[tokio::test]
313    async fn live_enriched_result_stamps_evidence_access_metadata() {
314        let server = MockServer::start().await;
315        Mock::given(any())
316            .respond_with(
317                ResponseTemplate::new(200)
318                    .set_body_string(r#"<html><h1 class="name">Alice Example</h1></html>"#),
319            )
320            .mount(&server)
321            .await;
322        let client = Client::builder()
323            .timeout(Duration::from_secs(2))
324            .min_request_interval(Duration::ZERO)
325            .max_retries(0)
326            .enrich(true)
327            .build()
328            .expect("client builds");
329        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
330        site.extract = vec![Extractor {
331            field: "name".to_owned(),
332            selector: "h1.name".to_owned(),
333            attr: None,
334        }];
335
336        let outcome = client.check(&site, &user()).await;
337
338        assert_eq!(outcome.kind, MatchKind::Found);
339        assert_eq!(outcome.profile_evidence.len(), 1);
340        let source = &outcome.profile_evidence[0].source;
341        assert!(source.observed_at_ms.is_some());
342        let access = source.access_path.as_ref().expect("access metadata");
343        assert_eq!(access.transport, crate::escalation::TransportTier::Http);
344        assert!(!access.escalated);
345        assert!(!access.authenticated);
346        assert!(!access.session_required);
347    }
348
349    #[tokio::test]
350    async fn authenticated_enriched_result_marks_authenticated_without_session_name() {
351        let server = MockServer::start().await;
352        Mock::given(any())
353            .and(wiremock::matchers::header("cookie", "sessionid=real"))
354            .respond_with(
355                ResponseTemplate::new(200)
356                    .set_body_string(r#"<html><h1 class="name">Alice Example</h1></html>"#),
357            )
358            .mount(&server)
359            .await;
360        let mut headers = std::collections::BTreeMap::new();
361        headers.insert("Cookie".to_string(), "sessionid=real".to_string());
362        let mut store = SessionStore::new();
363        store.insert("acct", crate::access::Session::from_headers(headers));
364        let client = Client::builder()
365            .timeout(Duration::from_secs(2))
366            .min_request_interval(Duration::ZERO)
367            .max_retries(0)
368            .sessions(store)
369            .enrich(true)
370            .build()
371            .expect("client builds");
372        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
373        site.access.session = Some("acct".to_owned());
374        site.extract = vec![Extractor {
375            field: "name".to_owned(),
376            selector: "h1.name".to_owned(),
377            attr: None,
378        }];
379
380        let outcome = client.check(&site, &user()).await;
381
382        assert_eq!(outcome.kind, MatchKind::Found);
383        let evidence = outcome.profile_evidence.first().expect("profile evidence");
384        let access = evidence
385            .source
386            .access_path
387            .as_ref()
388            .expect("access metadata");
389        assert!(access.authenticated);
390        let encoded = serde_json::to_string(evidence).unwrap();
391        assert!(!encoded.contains("acct"));
392        assert!(!encoded.contains("sessionid=real"));
393    }
394
395    #[tokio::test]
396    async fn missing_named_session_is_session_required() {
397        let server = MockServer::start().await;
398        Mock::given(any())
399            .respond_with(ResponseTemplate::new(200))
400            .mount(&server)
401            .await;
402        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
403        // Names a session the (empty) store doesn't have.
404        site.access.session = Some("not-configured".to_string());
405        let outcome = build_client().check(&site, &user()).await;
406        assert_eq!(outcome.kind, MatchKind::Uncertain);
407        assert!(
408            matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
409            "expected SessionRequired, got {:?}",
410            outcome.reason,
411        );
412        let recvd = server.received_requests().await.unwrap_or_default();
413        assert_eq!(
414            recvd.len(),
415            0,
416            "a missing session must skip the request, not probe unauthenticated"
417        );
418    }
419
420    #[cfg(feature = "impersonate")]
421    #[tokio::test]
422    async fn impersonate_routes_pure_tls_fingerprint_site() {
423        let server = MockServer::start().await;
424        Mock::given(any())
425            .respond_with(ResponseTemplate::new(200))
426            .mount(&server)
427            .await;
428        let client = Client::builder()
429            .timeout(Duration::from_secs(2))
430            .min_request_interval(Duration::ZERO)
431            .max_retries(0)
432            .build()
433            .expect("client builds with impersonate");
434        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
435        // Pure TLS-fingerprint protection — exactly the shape that
436        // routes to the impersonate fetcher.
437        site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];
438        let outcome = client.check(&site, &user()).await;
439        assert_eq!(
440            outcome.kind,
441            MatchKind::Found,
442            "expected Found (reason {:?})",
443            outcome.reason,
444        );
445        // wreq's Chrome-134 emulation sets a Chrome-shaped User-Agent —
446        // observable proof that the request came from the impersonate
447        // path and not the default `adler/<version>` HTTP fetcher.
448        let recvd = server.received_requests().await.expect("received requests");
449        assert_eq!(recvd.len(), 1, "expected exactly one request");
450        let ua = recvd[0]
451            .headers
452            .get("user-agent")
453            .and_then(|v| v.to_str().ok())
454            .unwrap_or("");
455        assert!(
456            ua.contains("Chrome/"),
457            "expected Chrome-shaped UA from wreq, got {ua:?}"
458        );
459    }
460
461    #[tokio::test]
462    async fn regex_check_pass_proceeds_to_probe() {
463        let server = MockServer::start().await;
464        Mock::given(any())
465            .and(path("/alice"))
466            .respond_with(ResponseTemplate::new(200))
467            .mount(&server)
468            .await;
469        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
470        // Pattern that matches "alice".
471        site.regex_check = Some("^[a-z]{3,}$".into());
472        let outcome = build_client().check(&site, &user()).await;
473        assert_eq!(outcome.kind, MatchKind::Found);
474    }
475
476    #[tokio::test]
477    async fn status_signal_reports_found_on_match() {
478        let server = MockServer::start().await;
479        Mock::given(any())
480            .and(path("/alice"))
481            .respond_with(ResponseTemplate::new(200))
482            .mount(&server)
483            .await;
484        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
485        let outcome = build_client().check(&site, &user()).await;
486        assert_eq!(outcome.kind, MatchKind::Found);
487        assert!(outcome.url.ends_with("/alice"));
488        assert!(outcome.reason.is_none());
489        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
490    }
491
492    #[tokio::test]
493    async fn body_username_signal_creates_exact_username_evidence_without_enrich() {
494        let server = MockServer::start().await;
495        Mock::given(any())
496            .and(path("/johndoe"))
497            .respond_with(ResponseTemplate::new(200).set_body_string(r#"{"handle":"johndoe"}"#))
498            .mount(&server)
499            .await;
500        let mut site = site_with(
501            &server,
502            vec![Signal::BodyUsername {
503                text: r#""handle":"{username}""#.into(),
504            }],
505        );
506        site.strip_bad_char = Some(".".into());
507
508        let outcome = build_client()
509            .check(&site, &Username::new("john.doe").unwrap())
510            .await;
511
512        assert_eq!(outcome.kind, MatchKind::Found);
513        assert!(outcome.enrichment.is_empty());
514        assert_eq!(outcome.profile_evidence.len(), 1);
515        let evidence = &outcome.profile_evidence[0];
516        assert_eq!(evidence.kind, ProfileEvidenceKind::Username);
517        assert_eq!(evidence.field, None);
518        assert_eq!(evidence.value, "johndoe");
519        assert_eq!(evidence.source.origin, EvidenceOrigin::Signal);
520        assert!(evidence.source.observed_at_ms.is_some());
521        assert!(
522            evidence
523                .source
524                .access_path
525                .as_ref()
526                .is_some_and(|path| path.transport == crate::TransportTier::Http)
527        );
528        assert!(
529            outcome
530                .confidence
531                .reasons
532                .iter()
533                .any(|reason| matches!(reason, ConfidenceReason::ExactUsernameMatch { count: 1 }))
534        );
535    }
536
537    #[tokio::test]
538    async fn generic_body_present_does_not_create_username_evidence() {
539        let server = MockServer::start().await;
540        Mock::given(any())
541            .and(path("/alice"))
542            .respond_with(ResponseTemplate::new(200).set_body_string(r#"{"username":"alice"}"#))
543            .mount(&server)
544            .await;
545        let site = site_with(
546            &server,
547            vec![Signal::BodyPresent {
548                text: "username".into(),
549            }],
550        );
551
552        let outcome = build_client().check(&site, &user()).await;
553
554        assert_eq!(outcome.kind, MatchKind::Found);
555        assert!(outcome.profile_evidence.is_empty());
556        assert!(
557            !outcome
558                .confidence
559                .reasons
560                .iter()
561                .any(|reason| matches!(reason, ConfidenceReason::ExactUsernameMatch { .. }))
562        );
563    }
564
565    #[tokio::test]
566    async fn status_signal_pair_reports_not_found_on_404() {
567        let server = MockServer::start().await;
568        Mock::given(any())
569            .and(path("/alice"))
570            .respond_with(ResponseTemplate::new(404))
571            .mount(&server)
572            .await;
573        let site = site_with(
574            &server,
575            vec![
576                Signal::StatusFound { codes: vec![200] },
577                Signal::StatusNotFound { codes: vec![404] },
578            ],
579        );
580        let outcome = build_client().check(&site, &user()).await;
581        assert_eq!(outcome.kind, MatchKind::NotFound);
582        // Only the NotFound-voting signal is cited as evidence.
583        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
584    }
585
586    #[tokio::test]
587    async fn conflicting_not_found_does_not_attach_username_evidence() {
588        let server = MockServer::start().await;
589        Mock::given(any())
590            .and(path("/alice"))
591            .respond_with(
592                ResponseTemplate::new(200)
593                    .set_body_string(r#"{"username":"alice","error":"missing"}"#),
594            )
595            .mount(&server)
596            .await;
597        let site = site_with(
598            &server,
599            vec![
600                Signal::BodyUsername {
601                    text: r#""username":"{username}""#.into(),
602                },
603                Signal::BodyAbsent {
604                    text: r#""error":"missing""#.into(),
605                },
606            ],
607        );
608
609        let outcome = build_client().check(&site, &user()).await;
610
611        assert_eq!(outcome.kind, MatchKind::NotFound);
612        assert!(outcome.profile_evidence.is_empty());
613    }
614
615    #[tokio::test]
616    async fn body_absent_signal_detects_missing_account() {
617        let server = MockServer::start().await;
618        Mock::given(any())
619            .and(path("/alice"))
620            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
621            .mount(&server)
622            .await;
623        let site = site_with(
624            &server,
625            vec![Signal::BodyAbsent {
626                text: "Profile not found".into(),
627            }],
628        );
629        let outcome = build_client().check(&site, &user()).await;
630        assert_eq!(outcome.kind, MatchKind::NotFound);
631    }
632
633    #[tokio::test]
634    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
635        // Phase 2 semantics: absence of an absence-marker is not evidence
636        // of presence — it just means we have no signal that fired.
637        let server = MockServer::start().await;
638        Mock::given(any())
639            .and(path("/alice"))
640            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
641            .mount(&server)
642            .await;
643        let site = site_with(
644            &server,
645            vec![Signal::BodyAbsent {
646                text: "Profile not found".into(),
647            }],
648        );
649        let outcome = build_client().check(&site, &user()).await;
650        assert_eq!(outcome.kind, MatchKind::Uncertain);
651    }
652
653    #[tokio::test]
654    async fn body_present_plus_absent_resolve_to_found() {
655        let server = MockServer::start().await;
656        Mock::given(any())
657            .and(path("/alice"))
658            .respond_with(
659                ResponseTemplate::new(200)
660                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
661            )
662            .mount(&server)
663            .await;
664        let site = site_with(
665            &server,
666            vec![
667                Signal::BodyPresent {
668                    text: "profile-card".into(),
669                },
670                Signal::BodyAbsent {
671                    text: "Profile not found".into(),
672                },
673            ],
674        );
675        let outcome = build_client().check(&site, &user()).await;
676        assert_eq!(outcome.kind, MatchKind::Found);
677    }
678
679    #[tokio::test]
680    async fn redirect_absent_signal_detects_missing_account() {
681        let server = MockServer::start().await;
682        Mock::given(any())
683            .and(path("/alice"))
684            .respond_with(
685                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
686            )
687            .mount(&server)
688            .await;
689        Mock::given(any())
690            .and(path("/login"))
691            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
692            .mount(&server)
693            .await;
694        let site = site_with(
695            &server,
696            vec![Signal::RedirectAbsent {
697                fragment: "/login".into(),
698            }],
699        );
700        let outcome = build_client().check(&site, &user()).await;
701        assert_eq!(outcome.kind, MatchKind::NotFound);
702    }
703
704    #[tokio::test]
705    async fn negative_signal_wins_over_positive() {
706        // StatusFound votes Found (200 matches); BodyAbsent votes NotFound
707        // (error marker appears). Negative-priority aggregation → NotFound.
708        // This is the canonical Sherlock "message" pattern: a site that
709        // returns 200 for everyone and differentiates via an error string.
710        let server = MockServer::start().await;
711        Mock::given(any())
712            .and(path("/alice"))
713            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
714            .mount(&server)
715            .await;
716        let site = site_with(
717            &server,
718            vec![
719                Signal::StatusFound { codes: vec![200] },
720                Signal::BodyAbsent {
721                    text: "Profile not found".into(),
722                },
723            ],
724        );
725        let outcome = build_client().check(&site, &user()).await;
726        assert_eq!(outcome.kind, MatchKind::NotFound);
727    }
728
729    #[tokio::test]
730    async fn network_failure_yields_uncertain() {
731        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
732        let port = listener.local_addr().unwrap().port();
733        drop(listener);
734
735        let site = Site {
736            name: "Dead".into(),
737            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
738            signals: vec![Signal::StatusFound { codes: vec![200] }],
739            known_present: None,
740            known_absent: None,
741            extract: Vec::new(),
742            tags: Vec::new(),
743            request_headers: std::collections::BTreeMap::new(),
744            regex_check: None,
745            engine: None,
746            strip_bad_char: None,
747            request_method: crate::site::HttpMethod::Get,
748            request_body: None,
749            protection: Vec::new(),
750            disabled: false,
751            disabled_reason: None,
752            source: None,
753            popularity: None,
754            access: crate::AccessPolicy::default(),
755        };
756        let client = Client::builder()
757            .timeout(Duration::from_millis(500))
758            .connect_timeout(Duration::from_millis(500))
759            .max_retries(0)
760            .build()
761            .unwrap();
762        let outcome = client.check(&site, &user()).await;
763        assert_eq!(outcome.kind, MatchKind::Uncertain);
764        assert!(outcome.reason.is_some());
765    }
766
767    #[tokio::test]
768    async fn throttle_spaces_consecutive_calls_to_same_host() {
769        let server = MockServer::start().await;
770        Mock::given(any())
771            .and(path("/alice"))
772            .respond_with(ResponseTemplate::new(200))
773            .mount(&server)
774            .await;
775        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
776        // Interval is intentionally much larger than typical wiremock latency
777        // (≤10 ms locally, can spike under heavy parallel test load). Any
778        // value too close to HTTP latency would let the first request burn
779        // through the throttle window and make the assertion flaky.
780        let client = Client::builder()
781            .timeout(Duration::from_secs(2))
782            .min_request_interval(Duration::from_millis(300))
783            .build()
784            .unwrap();
785
786        client.check(&site, &user()).await;
787        let started = Instant::now();
788        client.check(&site, &user()).await;
789        let elapsed = started.elapsed();
790        assert!(
791            elapsed >= Duration::from_millis(200),
792            "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
793        );
794    }
795
796    #[tokio::test]
797    async fn builder_overrides_user_agent() {
798        let server = MockServer::start().await;
799        Mock::given(any())
800            .and(path("/alice"))
801            .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
802            .respond_with(ResponseTemplate::new(200))
803            .mount(&server)
804            .await;
805        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
806        let client = Client::builder()
807            .user_agent("adler-test/1.0")
808            .build()
809            .unwrap();
810        let outcome = client.check(&site, &user()).await;
811        assert_eq!(outcome.kind, MatchKind::Found);
812    }
813
814    #[tokio::test]
815    async fn rate_limit_429_yields_uncertain_with_note() {
816        let server = MockServer::start().await;
817        Mock::given(any())
818            .and(path("/alice"))
819            .respond_with(ResponseTemplate::new(429))
820            .mount(&server)
821            .await;
822        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
823        let outcome = build_client().check(&site, &user()).await;
824        assert_eq!(outcome.kind, MatchKind::Uncertain);
825        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
826    }
827
828    #[tokio::test]
829    async fn cloudflare_server_header_yields_uncertain() {
830        let server = MockServer::start().await;
831        Mock::given(any())
832            .and(path("/alice"))
833            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
834            .mount(&server)
835            .await;
836        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
837        let outcome = build_client().check(&site, &user()).await;
838        assert_eq!(outcome.kind, MatchKind::Uncertain);
839        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
840    }
841
842    #[tokio::test]
843    async fn cloudflare_interstitial_in_body_yields_uncertain() {
844        // Body-based ban detection only runs when a signal already needs
845        // the body — this site uses BodyAbsent so the body is read.
846        let server = MockServer::start().await;
847        Mock::given(any())
848            .and(path("/alice"))
849            .respond_with(
850                ResponseTemplate::new(200)
851                    .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
852            )
853            .mount(&server)
854            .await;
855        let site = site_with(
856            &server,
857            vec![Signal::BodyAbsent {
858                text: "Profile not found".into(),
859            }],
860        );
861        let outcome = build_client().check(&site, &user()).await;
862        assert_eq!(outcome.kind, MatchKind::Uncertain);
863        assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
864    }
865
866    #[tokio::test]
867    async fn ban_detection_does_not_fire_on_legitimate_403() {
868        let server = MockServer::start().await;
869        Mock::given(any())
870            .and(path("/alice"))
871            .respond_with(ResponseTemplate::new(403))
872            .mount(&server)
873            .await;
874        let site = site_with(
875            &server,
876            vec![
877                Signal::StatusFound { codes: vec![200] },
878                Signal::StatusNotFound { codes: vec![403] },
879            ],
880        );
881        let outcome = build_client().check(&site, &user()).await;
882        // 403 is ambiguous for bans; site explicitly maps it to NotFound.
883        assert_eq!(outcome.kind, MatchKind::NotFound);
884        assert!(outcome.reason.is_none());
885    }
886
887    #[tokio::test]
888    async fn retry_recovers_after_transient_429() {
889        let server = MockServer::start().await;
890        // First request: 429. Subsequent: 200.
891        Mock::given(any())
892            .and(path("/alice"))
893            .respond_with(ResponseTemplate::new(429))
894            .up_to_n_times(1)
895            .mount(&server)
896            .await;
897        Mock::given(any())
898            .and(path("/alice"))
899            .respond_with(ResponseTemplate::new(200))
900            .mount(&server)
901            .await;
902        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
903        let client = Client::builder()
904            .timeout(Duration::from_secs(2))
905            .min_request_interval(Duration::ZERO)
906            .max_retries(2)
907            .base_backoff_delay(Duration::from_millis(20))
908            .max_backoff_delay(Duration::from_millis(100))
909            .build()
910            .unwrap();
911        let outcome = client.check(&site, &user()).await;
912        assert_eq!(outcome.kind, MatchKind::Found);
913        assert!(outcome.reason.is_none());
914    }
915
916    #[tokio::test]
917    async fn retry_exhausts_and_returns_uncertain() {
918        let server = MockServer::start().await;
919        Mock::given(any())
920            .and(path("/alice"))
921            .respond_with(ResponseTemplate::new(429))
922            .mount(&server)
923            .await;
924        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
925        let client = Client::builder()
926            .timeout(Duration::from_secs(2))
927            .min_request_interval(Duration::ZERO)
928            .max_retries(2)
929            .base_backoff_delay(Duration::from_millis(10))
930            .max_backoff_delay(Duration::from_millis(50))
931            .build()
932            .unwrap();
933        let outcome = client.check(&site, &user()).await;
934        assert_eq!(outcome.kind, MatchKind::Uncertain);
935        assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
936    }
937
938    #[tokio::test]
939    async fn retry_does_not_fire_on_network_error() {
940        // Connection refused → Uncertain note starts with "request:", not a
941        // ban marker. We must NOT retry — otherwise a single dead site
942        // burns the full backoff budget before reporting.
943        let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
944        let port = listener.local_addr().unwrap().port();
945        drop(listener);
946        let site = Site {
947            name: "Dead".into(),
948            url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
949            signals: vec![Signal::StatusFound { codes: vec![200] }],
950            known_present: None,
951            known_absent: None,
952            extract: Vec::new(),
953            tags: Vec::new(),
954            request_headers: std::collections::BTreeMap::new(),
955            regex_check: None,
956            engine: None,
957            strip_bad_char: None,
958            request_method: crate::site::HttpMethod::Get,
959            request_body: None,
960            protection: Vec::new(),
961            disabled: false,
962            disabled_reason: None,
963            source: None,
964            popularity: None,
965            access: crate::AccessPolicy::default(),
966        };
967        let client = Client::builder()
968            .timeout(Duration::from_millis(500))
969            .connect_timeout(Duration::from_millis(500))
970            .min_request_interval(Duration::ZERO)
971            .max_retries(3)
972            .base_backoff_delay(Duration::from_secs(60))
973            .build()
974            .unwrap();
975        let started = Instant::now();
976        let outcome = client.check(&site, &user()).await;
977        // If retry fired, we'd be sleeping minutes; instead this returns
978        // promptly with an Uncertain.
979        assert!(started.elapsed() < Duration::from_secs(5));
980        assert_eq!(outcome.kind, MatchKind::Uncertain);
981        assert!(
982            matches!(outcome.reason, Some(UncertainReason::Network(_))),
983            "got {:?}",
984            outcome.reason,
985        );
986    }
987
988    #[tokio::test]
989    async fn rotates_user_agent_per_request() {
990        // The mock only matches when the request carries one of the pooled
991        // UAs; if rotation weren't applied, the default adler/x.y UA would
992        // miss and the verdict would be NotFound.
993        let server = MockServer::start().await;
994        Mock::given(any())
995            .and(path("/alice"))
996            .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
997            .respond_with(ResponseTemplate::new(200))
998            .mount(&server)
999            .await;
1000        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1001        let client = Client::builder()
1002            .min_request_interval(Duration::ZERO)
1003            .max_retries(0)
1004            .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1005            .build()
1006            .unwrap();
1007        let outcome = client.check(&site, &user()).await;
1008        assert_eq!(outcome.kind, MatchKind::Found);
1009    }
1010
1011    #[test]
1012    fn invalid_proxy_url_fails_build() {
1013        let err = Client::builder().proxy("not a url").build().unwrap_err();
1014        assert!(matches!(err, Error::HttpSetup { .. }));
1015    }
1016
1017    #[test]
1018    fn schemeless_proxy_is_rejected_up_front() {
1019        // reqwest would silently treat this as a host; we require a scheme.
1020        let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1021        let Error::HttpSetup { message } = err else {
1022            panic!("expected HttpSetup, got {err:?}");
1023        };
1024        assert!(message.contains("must start with"), "{message}");
1025    }
1026
1027    #[test]
1028    fn socks5_proxy_scheme_is_accepted() {
1029        // Valid scheme + endpoint builds fine (no connection is attempted).
1030        assert!(
1031            Client::builder()
1032                .proxy("socks5://127.0.0.1:9050")
1033                .build()
1034                .is_ok()
1035        );
1036    }
1037
1038    #[tokio::test]
1039    async fn global_rps_cap_spaces_requests_across_hosts() {
1040        // Two distinct host paths; per-host throttle is disabled, so any
1041        // spacing must come from the global RPS cap. 5 RPS → 200 ms apart.
1042        let server = MockServer::start().await;
1043        Mock::given(any())
1044            .respond_with(ResponseTemplate::new(200))
1045            .mount(&server)
1046            .await;
1047        let site_a = Site {
1048            name: "A".into(),
1049            url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1050            signals: vec![Signal::StatusFound { codes: vec![200] }],
1051            known_present: None,
1052            known_absent: None,
1053            extract: Vec::new(),
1054            tags: Vec::new(),
1055            request_headers: std::collections::BTreeMap::new(),
1056            regex_check: None,
1057            engine: None,
1058            strip_bad_char: None,
1059            request_method: crate::site::HttpMethod::Get,
1060            request_body: None,
1061            protection: Vec::new(),
1062            disabled: false,
1063            disabled_reason: None,
1064            source: None,
1065            popularity: None,
1066            access: crate::AccessPolicy::default(),
1067        };
1068        let site_b = Site {
1069            name: "B".into(),
1070            url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1071            signals: vec![Signal::StatusFound { codes: vec![200] }],
1072            known_present: None,
1073            known_absent: None,
1074            extract: Vec::new(),
1075            tags: Vec::new(),
1076            request_headers: std::collections::BTreeMap::new(),
1077            regex_check: None,
1078            engine: None,
1079            strip_bad_char: None,
1080            request_method: crate::site::HttpMethod::Get,
1081            request_body: None,
1082            protection: Vec::new(),
1083            disabled: false,
1084            disabled_reason: None,
1085            source: None,
1086            popularity: None,
1087            access: crate::AccessPolicy::default(),
1088        };
1089        // 2 RPS → ~500 ms between requests. A large interval keeps the
1090        // assertion robust even when the first probe's own duration (which
1091        // eats into the measured gap) is inflated by test instrumentation
1092        // such as coverage tooling.
1093        let client = Client::builder()
1094            .min_request_interval(Duration::ZERO)
1095            .max_retries(0)
1096            .max_rps(std::num::NonZeroU32::new(2).unwrap())
1097            .build()
1098            .unwrap();
1099        // First request consumes the slot at t≈0; second waits ~500 ms even
1100        // though it targets a different host.
1101        client.check(&site_a, &user()).await;
1102        let started = Instant::now();
1103        client.check(&site_b, &user()).await;
1104        assert!(
1105            started.elapsed() >= Duration::from_millis(350),
1106            "global cap should space cross-host requests, got {:?}",
1107            started.elapsed(),
1108        );
1109    }
1110
1111    #[tokio::test]
1112    async fn respect_robots_skips_disallowed_paths() {
1113        let server = MockServer::start().await;
1114        Mock::given(any())
1115            .and(path("/robots.txt"))
1116            .respond_with(
1117                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1118            )
1119            .mount(&server)
1120            .await;
1121        Mock::given(any())
1122            .and(path("/no/alice"))
1123            .respond_with(ResponseTemplate::new(200))
1124            .mount(&server)
1125            .await;
1126        Mock::given(any())
1127            .and(path("/yes/alice"))
1128            .respond_with(ResponseTemplate::new(200))
1129            .mount(&server)
1130            .await;
1131        let client = Client::builder()
1132            .min_request_interval(Duration::ZERO)
1133            .max_retries(0)
1134            .respect_robots(true)
1135            .build()
1136            .unwrap();
1137
1138        let disallowed = Site {
1139            name: "No".into(),
1140            url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1141            signals: vec![Signal::StatusFound { codes: vec![200] }],
1142            known_present: None,
1143            known_absent: None,
1144            extract: Vec::new(),
1145            tags: Vec::new(),
1146            request_headers: std::collections::BTreeMap::new(),
1147            regex_check: None,
1148            engine: None,
1149            strip_bad_char: None,
1150            request_method: crate::site::HttpMethod::Get,
1151            request_body: None,
1152            protection: Vec::new(),
1153            disabled: false,
1154            disabled_reason: None,
1155            source: None,
1156            popularity: None,
1157            access: crate::AccessPolicy::default(),
1158        };
1159        let allowed = Site {
1160            name: "Yes".into(),
1161            url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1162            signals: vec![Signal::StatusFound { codes: vec![200] }],
1163            known_present: None,
1164            known_absent: None,
1165            extract: Vec::new(),
1166            tags: Vec::new(),
1167            request_headers: std::collections::BTreeMap::new(),
1168            regex_check: None,
1169            engine: None,
1170            strip_bad_char: None,
1171            request_method: crate::site::HttpMethod::Get,
1172            request_body: None,
1173            protection: Vec::new(),
1174            disabled: false,
1175            disabled_reason: None,
1176            source: None,
1177            popularity: None,
1178            access: crate::AccessPolicy::default(),
1179        };
1180
1181        let no = client.check(&disallowed, &user()).await;
1182        assert_eq!(no.kind, MatchKind::Uncertain);
1183        assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1184
1185        let yes = client.check(&allowed, &user()).await;
1186        assert_eq!(yes.kind, MatchKind::Found);
1187    }
1188
1189    #[tokio::test]
1190    async fn body_read_skipped_when_no_body_signal_needed() {
1191        // Mock returns body that would fail a body_absent check — but since
1192        // we only have a status signal, body is never read.
1193        let server = MockServer::start().await;
1194        Mock::given(any())
1195            .and(path("/alice"))
1196            .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1197            .mount(&server)
1198            .await;
1199        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1200        let outcome = build_client().check(&site, &user()).await;
1201        assert_eq!(outcome.kind, MatchKind::Found);
1202    }
1203
1204    // ===== Browser routing =====
1205
    /// Test backend that returns a canned page and counts calls. Lets the
    /// routing tests assert "Client did/did not invoke the browser" without
    /// involving a real Chrome process.
    #[derive(Debug)]
    struct RecordingBackend {
        /// Canned page; every `fetch` call returns a clone of it.
        page: RenderedPage,
        /// Number of `fetch` calls made so far (incremented once per call).
        calls: std::sync::atomic::AtomicUsize,
    }
1214
1215    impl RecordingBackend {
1216        fn with_page(page: RenderedPage) -> Self {
1217            Self {
1218                page,
1219                calls: std::sync::atomic::AtomicUsize::new(0),
1220            }
1221        }
1222        fn call_count(&self) -> usize {
1223            self.calls.load(std::sync::atomic::Ordering::SeqCst)
1224        }
1225    }
1226
1227    #[async_trait::async_trait]
1228    impl BrowserBackend for RecordingBackend {
1229        async fn fetch(
1230            &self,
1231            _url: &url::Url,
1232            _headers: &std::collections::BTreeMap<String, String>,
1233            _timeout: Duration,
1234        ) -> Result<RenderedPage> {
1235            self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1236            Ok(self.page.clone())
1237        }
1238    }
1239
1240    fn site_bot_protected(server: &MockServer) -> Site {
1241        let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1242        s.tags = vec![BOT_PROTECTED_TAG.into()];
1243        s
1244    }
1245
1246    #[tokio::test]
1247    async fn browser_routes_bot_protected_sites() {
1248        // wiremock would *not* fire (raw HTTP path is skipped) — the backend
1249        // returns its canned page directly.
1250        let server = MockServer::start().await;
1251        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1252            status: 200,
1253            final_url: url::Url::parse("https://example.com/alice").unwrap(),
1254            body: "<html></html>".into(),
1255            elapsed_ms: 42,
1256        }));
1257        let client = Client::builder()
1258            .min_request_interval(Duration::ZERO)
1259            .max_retries(0)
1260            .browser(backend.clone())
1261            .build()
1262            .unwrap();
1263        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1264        assert_eq!(outcome.kind, MatchKind::Found);
1265        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1266    }
1267
1268    #[tokio::test]
1269    async fn non_bot_protected_sites_skip_browser() {
1270        let server = MockServer::start().await;
1271        Mock::given(any())
1272            .and(path("/alice"))
1273            .respond_with(ResponseTemplate::new(200))
1274            .mount(&server)
1275            .await;
1276        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1277            status: 500, // would make wiremock case fail if browser was taken
1278            final_url: url::Url::parse("https://x/").unwrap(),
1279            body: String::new(),
1280            elapsed_ms: 0,
1281        }));
1282        let client = Client::builder()
1283            .min_request_interval(Duration::ZERO)
1284            .max_retries(0)
1285            .browser(backend.clone())
1286            .build()
1287            .unwrap();
1288        // site WITHOUT bot-protected tag → must go via raw HTTP (wiremock).
1289        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1290        let outcome = client.check(&site, &user()).await;
1291        assert_eq!(outcome.kind, MatchKind::Found);
1292        assert_eq!(backend.call_count(), 0, "browser must not be touched");
1293    }
1294
1295    #[tokio::test]
1296    async fn browser_budget_exhaust_yields_uncertain() {
1297        let server = MockServer::start().await;
1298        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1299            status: 200,
1300            final_url: url::Url::parse("https://x/").unwrap(),
1301            body: String::new(),
1302            elapsed_ms: 0,
1303        }));
1304        let client = Client::builder()
1305            .min_request_interval(Duration::ZERO)
1306            .max_retries(0)
1307            .browser(backend.clone())
1308            .browser_budget(1)
1309            .build()
1310            .unwrap();
1311        let site = site_bot_protected(&server);
1312        // First call consumes the only slot.
1313        let first = client.check(&site, &user()).await;
1314        assert_eq!(first.kind, MatchKind::Found);
1315        // Second call hits the cap → Uncertain(BrowserBudget), backend NOT invoked.
1316        let second = client.check(&site, &user()).await;
1317        assert_eq!(second.kind, MatchKind::Uncertain);
1318        assert!(matches!(
1319            second.reason,
1320            Some(UncertainReason::BrowserBudget)
1321        ));
1322        assert_eq!(
1323            backend.call_count(),
1324            1,
1325            "second call must not invoke backend"
1326        );
1327    }
1328
1329    #[tokio::test]
1330    async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1331        struct FailingBackend;
1332        #[async_trait::async_trait]
1333        impl BrowserBackend for FailingBackend {
1334            async fn fetch(
1335                &self,
1336                _url: &url::Url,
1337                _headers: &std::collections::BTreeMap<String, String>,
1338                _timeout: Duration,
1339            ) -> Result<RenderedPage> {
1340                Err(Error::BrowserSetup {
1341                    message: "simulated crash".into(),
1342                })
1343            }
1344        }
1345        impl std::fmt::Debug for FailingBackend {
1346            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1347                f.write_str("FailingBackend")
1348            }
1349        }
1350
1351        let server = MockServer::start().await;
1352        let client = Client::builder()
1353            .min_request_interval(Duration::ZERO)
1354            .max_retries(0)
1355            .browser(Arc::new(FailingBackend))
1356            .build()
1357            .unwrap();
1358        let outcome = client.check(&site_bot_protected(&server), &user()).await;
1359        assert_eq!(outcome.kind, MatchKind::Uncertain);
1360        match outcome.reason {
1361            Some(UncertainReason::BrowserFailed(msg)) => {
1362                assert!(msg.contains("simulated crash"), "got: {msg}");
1363            }
1364            other => panic!("expected BrowserFailed, got {other:?}"),
1365        }
1366    }
1367
1368    #[tokio::test]
1369    async fn status_only_site_uses_head_request() {
1370        // Site with only status signals (no body markers, no enrichment)
1371        // should be probed with HEAD — saves the body download on
1372        // ~30% of the registry.
1373        let server = MockServer::start().await;
1374        Mock::given(method("HEAD"))
1375            .and(path("/alice"))
1376            .respond_with(ResponseTemplate::new(200))
1377            .mount(&server)
1378            .await;
1379        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1380        let outcome = build_client().check(&site, &user()).await;
1381        assert_eq!(outcome.kind, MatchKind::Found);
1382        let recvd = server.received_requests().await.unwrap_or_default();
1383        assert_eq!(recvd.len(), 1);
1384        assert_eq!(recvd[0].method.as_str(), "HEAD");
1385    }
1386
1387    #[tokio::test]
1388    async fn body_signal_site_uses_get_request() {
1389        // Same baseline plus a body-marker signal — must still GET so
1390        // the body actually arrives for matching.
1391        let server = MockServer::start().await;
1392        Mock::given(any())
1393            .and(path("/alice"))
1394            .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
1395            .mount(&server)
1396            .await;
1397        let site = site_with(
1398            &server,
1399            vec![Signal::BodyPresent {
1400                text: "hello".into(),
1401            }],
1402        );
1403        let outcome = build_client().check(&site, &user()).await;
1404        assert_eq!(outcome.kind, MatchKind::Found);
1405        let recvd = server.received_requests().await.unwrap_or_default();
1406        assert_eq!(recvd[0].method.as_str(), "GET");
1407    }
1408
1409    #[tokio::test]
1410    async fn protection_field_routes_through_browser_like_bot_protected_tag() {
1411        // A site that declares `protection: [Cloudflare]` but doesn't
1412        // carry the legacy `bot-protected` tag should still route
1413        // through the browser backend — the new structured field is
1414        // an additional signal, not a tag replacement.
1415        let server = MockServer::start().await;
1416        Mock::given(any())
1417            .respond_with(ResponseTemplate::new(200))
1418            .mount(&server)
1419            .await;
1420        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1421        site.protection = vec![crate::site::ProtectionKind::Cloudflare];
1422        // No bot-protected tag — pure structured-field test.
1423        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1424            status: 200,
1425            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1426            body: String::new(),
1427            elapsed_ms: 0,
1428        }));
1429        let client = Client::builder()
1430            .min_request_interval(Duration::ZERO)
1431            .max_retries(0)
1432            .browser(backend)
1433            .build()
1434            .unwrap();
1435        let outcome = client.check(&site, &user()).await;
1436        // The recording backend always returns a synthetic 200, so
1437        // Found means we went through the browser path.
1438        assert_eq!(outcome.kind, MatchKind::Found);
1439        // No raw HTTP probe should have hit the mock server.
1440        let recvd = server.received_requests().await.unwrap_or_default();
1441        assert_eq!(
1442            recvd.len(),
1443            0,
1444            "structured protection must skip the raw HTTP path"
1445        );
1446    }
1447
1448    #[tokio::test]
1449    async fn user_auth_protection_alone_uses_http_session_path() {
1450        let server = MockServer::start().await;
1451        Mock::given(any())
1452            .and(path("/alice"))
1453            .respond_with(ResponseTemplate::new(200))
1454            .mount(&server)
1455            .await;
1456        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1457            status: 500,
1458            final_url: url::Url::parse("https://x/").unwrap(),
1459            body: String::new(),
1460            elapsed_ms: 0,
1461        }));
1462        let client = Client::builder()
1463            .min_request_interval(Duration::ZERO)
1464            .max_retries(0)
1465            .browser(backend.clone())
1466            .build()
1467            .unwrap();
1468        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1469        site.protection = vec![ProtectionKind::UserAuth];
1470
1471        let outcome = client.check(&site, &user()).await;
1472
1473        assert_eq!(outcome.kind, MatchKind::Found);
1474        assert_eq!(
1475            backend.call_count(),
1476            0,
1477            "user-auth alone must not invoke browser"
1478        );
1479        let recvd = server.received_requests().await.unwrap_or_default();
1480        assert_eq!(recvd.len(), 1, "user-auth alone should use raw HTTP");
1481    }
1482
1483    #[tokio::test]
1484    async fn post_method_sends_body_with_username_substituted() {
1485        // A POST-probed site (e.g. Anilist GraphQL) — the username
1486        // goes in the body, not the URL. Adler should substitute
1487        // `{username}` and send a POST with the rendered payload.
1488        let server = MockServer::start().await;
1489        Mock::given(method("POST"))
1490            .and(path("/api"))
1491            .respond_with(ResponseTemplate::new(200))
1492            .mount(&server)
1493            .await;
1494        // URL substitution still requires the `{username}` placeholder,
1495        // even for POST sites where the username also lives in the
1496        // body. Most real POST endpoints encode the username in both
1497        // (e.g. query string + body); we mirror that.
1498        let site = Site {
1499            name: "ApiPost".into(),
1500            url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
1501            signals: vec![Signal::StatusFound { codes: vec![200] }],
1502            known_present: None,
1503            known_absent: None,
1504            extract: Vec::new(),
1505            tags: Vec::new(),
1506            request_headers: std::collections::BTreeMap::new(),
1507            regex_check: None,
1508            engine: None,
1509            strip_bad_char: None,
1510            request_method: HttpMethod::Post,
1511            request_body: Some(r#"{"name":"{username}"}"#.into()),
1512            protection: Vec::new(),
1513            disabled: false,
1514            disabled_reason: None,
1515            source: None,
1516            popularity: None,
1517            access: crate::AccessPolicy::default(),
1518        };
1519        let outcome = build_client().check(&site, &user()).await;
1520        assert_eq!(outcome.kind, MatchKind::Found);
1521        let recvd = server.received_requests().await.unwrap_or_default();
1522        assert_eq!(recvd.len(), 1);
1523        assert_eq!(recvd[0].method.as_str(), "POST");
1524        let body = String::from_utf8_lossy(&recvd[0].body).to_string();
1525        assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
1526    }
1527
1528    #[tokio::test]
1529    async fn head_405_falls_back_to_get() {
1530        // A server that rejects HEAD with 405 — Adler should silently
1531        // retry with GET so the optimisation can never cost accuracy.
1532        let server = MockServer::start().await;
1533        Mock::given(method("HEAD"))
1534            .and(path("/alice"))
1535            .respond_with(ResponseTemplate::new(405))
1536            .mount(&server)
1537            .await;
1538        Mock::given(any())
1539            .and(path("/alice"))
1540            .respond_with(ResponseTemplate::new(200))
1541            .mount(&server)
1542            .await;
1543        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1544        let outcome = build_client().check(&site, &user()).await;
1545        assert_eq!(outcome.kind, MatchKind::Found);
1546        let recvd = server.received_requests().await.unwrap_or_default();
1547        assert_eq!(recvd.len(), 2);
1548        assert_eq!(recvd[0].method.as_str(), "HEAD");
1549        assert_eq!(recvd[1].method.as_str(), "GET");
1550    }
1551
1552    // ------------------------------------------------------------------
1553    // Phase 4 — automatic escalation when the cheap transport hits a
1554    // Cloudflare / rate-limit Uncertain that the browser could resolve.
1555    // ------------------------------------------------------------------
1556
1557    /// Mocked HTTP that always responds with a Cloudflare 503 (server
1558    /// header + 503 status — what the pre-body ban detector turns into
1559    /// `Uncertain(CloudflareChallenge)`).
1560    async fn cloudflare_503_server() -> MockServer {
1561        let server = MockServer::start().await;
1562        Mock::given(any())
1563            .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1564            .mount(&server)
1565            .await;
1566        server
1567    }
1568
1569    #[tokio::test]
1570    async fn http_success_stamps_http_transport_no_escalations() {
1571        let server = MockServer::start().await;
1572        Mock::given(any())
1573            .respond_with(ResponseTemplate::new(200))
1574            .mount(&server)
1575            .await;
1576        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1577        let outcome = build_client().check(&site, &user()).await;
1578        assert_eq!(outcome.kind, MatchKind::Found);
1579        assert_eq!(
1580            outcome.transport,
1581            Some(crate::escalation::TransportTier::Http),
1582            "successful HTTP probe must stamp Http transport"
1583        );
1584        assert_eq!(outcome.escalations, 0, "no escalation on the happy path");
1585    }
1586
1587    #[tokio::test]
1588    async fn escalates_cloudflare_uncertain_to_browser_and_stamps_one() {
1589        let server = cloudflare_503_server().await;
1590        // Browser returns a 200 that the StatusFound signal turns into Found.
1591        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1592            status: 200,
1593            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1594            body: String::new(),
1595            elapsed_ms: 5,
1596        }));
1597        let client = Client::builder()
1598            .min_request_interval(Duration::ZERO)
1599            .max_retries(0)
1600            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
1601            .build()
1602            .unwrap();
1603        // Non-bot-protected site — HTTP path runs first, hits Cloudflare,
1604        // escalation routes to the browser, browser's 200 → Found.
1605        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1606        let outcome = client.check(&site, &user()).await;
1607        assert_eq!(
1608            outcome.kind,
1609            MatchKind::Found,
1610            "escalation should flip CF challenge to Found via browser (reason {:?})",
1611            outcome.reason
1612        );
1613        assert_eq!(
1614            outcome.transport,
1615            Some(crate::escalation::TransportTier::Browser),
1616            "escalated outcome must be stamped Browser"
1617        );
1618        assert_eq!(
1619            outcome.escalations, 1,
1620            "exactly one escalation should have fired"
1621        );
1622        assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1623    }
1624
1625    #[tokio::test]
1626    async fn disable_escalation_leaves_cloudflare_uncertain_untouched() {
1627        let server = cloudflare_503_server().await;
1628        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1629            status: 200,
1630            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1631            body: String::new(),
1632            elapsed_ms: 0,
1633        }));
1634        let client = Client::builder()
1635            .min_request_interval(Duration::ZERO)
1636            .max_retries(0)
1637            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
1638            .disable_escalation()
1639            .build()
1640            .unwrap();
1641        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1642        let outcome = client.check(&site, &user()).await;
1643        assert_eq!(outcome.kind, MatchKind::Uncertain);
1644        assert!(matches!(
1645            outcome.reason,
1646            Some(UncertainReason::CloudflareChallenge)
1647        ));
1648        assert_eq!(
1649            outcome.transport,
1650            Some(crate::escalation::TransportTier::Http),
1651            "primary transport must still be stamped"
1652        );
1653        assert_eq!(outcome.escalations, 0);
1654        assert_eq!(
1655            backend.call_count(),
1656            0,
1657            "browser must not be touched when --no-escalation"
1658        );
1659    }
1660
1661    #[tokio::test]
1662    async fn escalation_budget_zero_keeps_browser_untouched() {
1663        let server = cloudflare_503_server().await;
1664        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1665            status: 200,
1666            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1667            body: String::new(),
1668            elapsed_ms: 0,
1669        }));
1670        let client = Client::builder()
1671            .min_request_interval(Duration::ZERO)
1672            .max_retries(0)
1673            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
1674            .escalation_budget(0)
1675            .build()
1676            .unwrap();
1677        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1678        let outcome = client.check(&site, &user()).await;
1679        assert_eq!(outcome.kind, MatchKind::Uncertain);
1680        assert!(matches!(
1681            outcome.reason,
1682            Some(UncertainReason::CloudflareChallenge)
1683        ));
1684        assert_eq!(outcome.escalations, 0);
1685        assert_eq!(
1686            backend.call_count(),
1687            0,
1688            "zero budget must deny every escalation"
1689        );
1690    }
1691
1692    #[tokio::test]
1693    async fn escalation_consumes_budget_then_stops() {
1694        let server = cloudflare_503_server().await;
1695        let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1696            status: 200,
1697            final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
1698            body: String::new(),
1699            elapsed_ms: 0,
1700        }));
1701        let client = Client::builder()
1702            .min_request_interval(Duration::ZERO)
1703            .max_retries(0)
1704            .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
1705            .escalation_budget(1)
1706            .build()
1707            .unwrap();
1708        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1709        // First call burns the only escalation slot.
1710        let first = client.check(&site, &user()).await;
1711        assert_eq!(first.kind, MatchKind::Found);
1712        assert_eq!(first.escalations, 1);
1713        // Second call's escalation is denied → cheap-path Uncertain survives.
1714        let second = client.check(&site, &user()).await;
1715        assert_eq!(second.kind, MatchKind::Uncertain);
1716        assert!(matches!(
1717            second.reason,
1718            Some(UncertainReason::CloudflareChallenge)
1719        ));
1720        assert_eq!(second.escalations, 0);
1721        assert_eq!(backend.call_count(), 1, "browser called exactly once total");
1722    }
1723}