1use std::borrow::Cow;
11use std::collections::BTreeMap;
12use std::fmt;
13use std::num::NonZeroU32;
14use std::sync::Arc;
15use std::time::{Duration, Instant};
16
17use reqwest::redirect;
18
19use crate::access::{EgressChoice, EgressPool, EgressSpec, SessionStore};
20use crate::browser::{BrowserBackend, BrowserBudget};
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::error::{Error, Result};
23use crate::retry::{self, RetryPolicy};
24use crate::robots::RobotsCache;
25use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
26use crate::throttle::HostThrottle;
27#[cfg(feature = "impersonate")]
28use crate::transport::ImpersonateFetcher;
29use crate::transport::{
30 BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
31};
32use crate::username::Username;
33
/// Whole-request timeout used by [`ClientBuilder::default`].
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connect-phase timeout used by [`ClientBuilder::default`].
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect cap used by [`ClientBuilder::default`]; only applies while redirects are followed.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default `min_request_interval`, which `build` hands to the per-host throttle.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Fixed key passed to the global throttle, so every request waits under the
/// same key regardless of its host.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
40
/// Probing client: bundles the fetchers, throttles, retry policy and budgets
/// that [`Client::check`] uses to decide whether a username exists on a site.
///
/// `Clone` is derived; the heavier members are held behind `Arc`.
#[derive(Clone)]
pub struct Client {
    /// Default HTTP fetcher: used by `fetch` and whenever egress selection
    /// returns `EgressChoice::Default`.
    http: Arc<HttpFetcher>,
    /// Pool asked (`select(&site.access)`) which fetcher should carry a probe.
    egress: Arc<EgressPool>,
    /// Named sessions; a site that names one has its headers produced by the
    /// session's `apply`.
    sessions: Arc<SessionStore>,
    /// Throttle waited on with the request's host as key.
    throttle: HostThrottle,
    /// Optional second throttle, always waited on under `GLOBAL_THROTTLE_KEY`.
    /// `Some` only when the builder's `max_rps` was set.
    global_throttle: Option<HostThrottle>,
    /// Policy consulted by `check` to decide on retries and backoff delays.
    retry: RetryPolicy,
    /// User-agent rotation pool; when empty no per-request user agent is chosen.
    user_agents: Arc<[String]>,
    /// Enables enrichment extraction on `Found` outcomes.
    enrich: bool,
    /// robots.txt cache; `Some` only when the builder's `respect_robots` was true.
    robots: Option<RobotsCache>,
    /// Optional browser backend used for protected sites and for escalation.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Impersonating fetcher (feature `impersonate`), used for sites whose only
    /// declared protection is `TlsFingerprint`.
    #[cfg(feature = "impersonate")]
    impersonate: Option<Arc<ImpersonateFetcher>>,
    /// Budget consumed once per direct browser-tier probe.
    browser_budget: Arc<BrowserBudget>,
    /// Budget consumed once per escalation to the browser tier.
    escalation_budget: Arc<crate::escalation::EscalationBudget>,
    /// Master switch for escalation; when `false`, `maybe_escalate` returns the
    /// primary outcome unchanged.
    escalation_enabled: bool,
}
90
91impl Client {
    /// Starts a [`ClientBuilder`] populated with its `Default` values.
    pub fn builder() -> ClientBuilder {
        ClientBuilder::default()
    }
96
    /// Returns the egress pool's summary entries (delegates to `EgressPool::summary`).
    #[must_use]
    pub fn egress_summary(&self) -> Vec<crate::access::EgressSummary> {
        self.egress.summary()
    }
105
    /// Returns the names reported by the session store (delegates to `SessionStore::names`).
    #[must_use]
    pub fn session_names(&self) -> Vec<String> {
        self.sessions.names()
    }
113
    /// Returns the names reported by the egress pool (delegates to `EgressPool::names`).
    #[must_use]
    pub fn egress_names(&self) -> Vec<String> {
        self.egress.names()
    }
121
122 #[must_use]
135 pub fn with_egress_subset(&self, names: &[String]) -> Self {
136 Self {
137 http: Arc::clone(&self.http),
138 egress: Arc::new(self.egress.subset(names)),
139 sessions: Arc::clone(&self.sessions),
140 throttle: self.throttle.clone(),
141 global_throttle: self.global_throttle.clone(),
142 retry: self.retry.clone(),
143 user_agents: Arc::clone(&self.user_agents),
144 enrich: self.enrich,
145 robots: self.robots.clone(),
146 browser: self.browser.clone(),
147 #[cfg(feature = "impersonate")]
148 impersonate: self.impersonate.clone(),
149 browser_budget: Arc::clone(&self.browser_budget),
150 escalation_budget: Arc::clone(&self.escalation_budget),
151 escalation_enabled: self.escalation_enabled,
152 }
153 }
154
155 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
169 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
170 let mut attempt: u32 = 0;
171 loop {
172 let outcome = self.probe_once(site, username).await;
173 if !retry::should_retry(&outcome, attempt, &self.retry) {
174 return outcome;
175 }
176 let delay = retry::backoff_delay(attempt, &self.retry);
177 tracing::info!(
178 site = %site.name,
179 attempt = attempt + 1,
180 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
181 ?delay,
182 "transient ban, retrying",
183 );
184 tokio::time::sleep(delay).await;
185 attempt += 1;
186 }
187 }
188
189 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
198 let host = host_of(url);
199 if let Some(global) = &self.global_throttle {
200 global.wait(GLOBAL_THROTTLE_KEY).await;
201 }
202 self.throttle.wait(&host).await;
203 let mut request = self.http.client().get(url);
204 if let Some(ua) = self.pick_user_agent() {
205 request = request.header(reqwest::header::USER_AGENT, ua);
206 }
207 let response = request.send().await.ok()?;
208 let status = response.status().as_u16();
209 let final_url = response.url().to_string();
210 let body = response.text().await.unwrap_or_default();
211 Some(RawResponse {
212 status,
213 final_url,
214 body,
215 })
216 }
217
218 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
229 if let Some(backend) = self.browser.as_deref() {
230 let has_tag = site
231 .tags
232 .iter()
233 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
234 if has_tag || !site.protection.is_empty() {
235 let parsed = url::Url::parse(url).ok()?;
236 match backend
237 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
238 .await
239 {
240 Ok(page) => {
241 return Some(RawResponse {
242 status: page.status,
243 final_url: page.final_url.to_string(),
244 body: page.body,
245 });
246 }
247 Err(err) => {
248 tracing::warn!(
249 site = %site.name, %url, error = %err,
250 "browser fetch failed in doctor; falling back to raw HTTP",
251 );
252 }
253 }
254 }
255 }
256 self.fetch(url).await
257 }
258
259 fn pick_user_agent(&self) -> Option<&str> {
262 match self.user_agents.len() {
263 0 => None,
264 1 => Some(&self.user_agents[0]),
265 n => Some(&self.user_agents[fastrand::usize(0..n)]),
266 }
267 }
268
    /// Runs one probe of `site` for `username`, with no retry.
    ///
    /// Stages, in order (each may return early):
    /// 1. `site.regex_check`: a username that does not match yields
    ///    `UsernameNotAllowed`.
    /// 2. Session resolution: a named session missing from the store yields
    ///    `SessionRequired`.
    /// 3. Browser tier: used when a backend is configured and the site is
    ///    tagged `bot-protected` or declares any protection; limited by the
    ///    browser budget.
    /// 4. Impersonation tier (feature `impersonate`): used when the site's only
    ///    protection is `TlsFingerprint` and it is not tagged `bot-protected`.
    /// 5. Plain HTTP tier: egress selection, robots check, throttling, fetch.
    ///
    /// Results from tiers 4 and 5 are passed through `maybe_escalate`.
    // The function is one linear pipeline, hence the length lint is silenced.
    #[allow(clippy::too_many_lines)]
    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
        let url = site.url_for(username);

        // Username pre-check. A pattern that fails to compile is ignored, so the
        // probe proceeds as if no check were configured.
        if let Some(pat) = &site.regex_check {
            if let Ok(re) = regex::Regex::new(pat) {
                if !re.is_match(username.as_str()) {
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::UsernameNotAllowed,
                    );
                }
            }
        }

        // Borrow the site's own headers when no session is named; otherwise own
        // the map produced by applying the session to them.
        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
            None => Cow::Borrowed(&site.request_headers),
            Some(name) => match self.sessions.get(name) {
                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
                None => {
                    // A named but unconfigured session: do not send any request.
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::SessionRequired,
                    );
                }
            },
        };
        let headers: &BTreeMap<String, String> = &session_headers;

        // Browser tier. Both branches below return, so a site that qualifies for
        // the browser never reaches the impersonation or plain HTTP tiers.
        if let Some(backend) = &self.browser {
            let has_tag = site
                .tags
                .iter()
                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if has_tag || !site.protection.is_empty() {
                if self.browser_budget.try_consume() {
                    let started = Instant::now();
                    // No request body and no user-agent override on this tier.
                    let req = FetchRequest {
                        method: site.request_method,
                        url: &url,
                        body: None,
                        user_agent: None,
                        headers,
                        want_body: true,
                    };
                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
                    let mut outcome = match fetcher.fetch(&req).await {
                        Ok(resp) => self.finish(site, url, started, &resp),
                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
                    };
                    outcome.transport = Some(crate::escalation::TransportTier::Browser);
                    return outcome;
                }
                // Budget refused: report that instead of falling back to another tier.
                tracing::warn!(site = %site.name, "browser budget exhausted");
                let mut outcome = uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::BrowserBudget,
                );
                outcome.transport = Some(crate::escalation::TransportTier::Browser);
                return outcome;
            }
        }

        // Impersonation tier: only for sites whose sole protection entry is
        // `TlsFingerprint` and which are not tagged `bot-protected`.
        #[cfg(feature = "impersonate")]
        if let Some(fetcher) = &self.impersonate {
            let pure_tls = site.protection.len() == 1
                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
                && !site
                    .tags
                    .iter()
                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if pure_tls {
                let started = Instant::now();
                let req = FetchRequest {
                    method: site.request_method,
                    url: &url,
                    body: None,
                    user_agent: self.pick_user_agent(),
                    headers,
                    want_body: true,
                };
                let mut primary = match fetcher.fetch(&req).await {
                    Ok(resp) => self.finish(site, url.clone(), started, &resp),
                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
                };
                primary.transport = Some(crate::escalation::TransportTier::Impersonate);
                return self.maybe_escalate(site, &url, headers, primary).await;
            }
        }

        // Plain HTTP tier: choose the fetcher that carries the request.
        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
            EgressChoice::Default => Arc::clone(&self.http),
            EgressChoice::Use(fetcher) => fetcher,
            EgressChoice::Unavailable => {
                return uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::GeoUnavailable,
                );
            }
        };

        let host = host_of(&url);

        // robots.txt is consulted only when a cache was configured, and only if
        // the URL splits into an origin and a path.
        if let Some(robots) = &self.robots {
            if let Some((origin, path)) = origin_and_path(&url) {
                if !robots.allowed(&origin, &path).await {
                    tracing::debug!(%url, "skipped by robots.txt");
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::RobotsDisallowed,
                    );
                }
            }
        }

        // Throttle waits happen before `started` is taken, so they are not
        // counted in the outcome's elapsed time.
        if let Some(global) = &self.global_throttle {
            global.wait(GLOBAL_THROTTLE_KEY).await;
        }
        self.throttle.wait(&host).await;
        let started = Instant::now();
        tracing::debug!(%url, %host, "probing");

        // Ask for the body only when enrichment or some signal needs it.
        let want_enrich = self.enrich && !site.extract.is_empty();
        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);

        // POST sites get their body template with `{username}` substituted;
        // other methods send no body.
        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
            const USERNAME_PH: &str = "{username}";
            site.request_body
                .as_deref()
                .map(|t| t.replace(USERNAME_PH, username.as_str()))
        } else {
            None
        };

        let req = FetchRequest {
            method: site.request_method,
            url: &url,
            body: body_for_post.as_deref(),
            user_agent: self.pick_user_agent(),
            headers,
            want_body: needs_body,
        };
        let mut primary = match egress.fetch(&req).await {
            Ok(resp) => self.finish(site, url.clone(), started, &resp),
            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
        };
        primary.transport = Some(crate::escalation::TransportTier::Http);
        self.maybe_escalate(site, &url, headers, primary).await
    }
471
472 async fn maybe_escalate(
477 &self,
478 site: &Site,
479 url: &str,
480 headers: &BTreeMap<String, String>,
481 primary: CheckOutcome,
482 ) -> CheckOutcome {
483 if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
484 return primary;
485 }
486 let Some(reason) = &primary.reason else {
487 return primary;
488 };
489 if !crate::escalation::should_escalate(reason) {
490 return primary;
491 }
492 let Some(backend) = &self.browser else {
493 return primary;
494 };
495 if !self.escalation_budget.try_consume() {
496 tracing::debug!(site = %site.name, "escalation budget exhausted");
497 return primary;
498 }
499
500 tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
501 let started = Instant::now();
502 let req = FetchRequest {
503 method: site.request_method,
504 url,
505 body: None,
506 user_agent: None,
507 headers,
508 want_body: true,
509 };
510 let fetcher = BrowserFetcher::new(Arc::clone(backend));
511 let mut escalated = match fetcher.fetch(&req).await {
512 Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
513 Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
514 };
515 escalated.transport = Some(crate::escalation::TransportTier::Browser);
516 escalated.escalations = 1;
517 escalated
518 }
519
520 fn finish(
524 &self,
525 site: &Site,
526 url: String,
527 started: Instant,
528 resp: &crate::transport::FetchResponse,
529 ) -> CheckOutcome {
530 let probe = Probe {
531 status: resp.status,
532 final_url: &resp.final_url,
533 body: &resp.body,
534 };
535 let votes: Vec<(&Signal, SignalVerdict)> = site
536 .signals
537 .iter()
538 .map(|s| (s, s.evaluate(&probe)))
539 .collect();
540 let kind = aggregate(votes.iter().map(|(_, v)| *v));
541 let mut result = outcome(&site.name, url, started, kind);
542 let winning = match kind {
544 MatchKind::Found => Some(SignalVerdict::Found),
545 MatchKind::NotFound => Some(SignalVerdict::NotFound),
546 MatchKind::Uncertain => None,
547 };
548 if let Some(want) = winning {
549 result.evidence = votes
550 .iter()
551 .filter(|(_, v)| *v == want)
552 .map(|(s, _)| s.describe_match(&probe))
553 .collect();
554 }
555 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
556 result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
557 }
558 result
559 }
560}
561
/// Unprocessed response returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`].
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code.
    pub status: u16,
    /// URL reported by the response (or by the rendered browser page).
    pub final_url: String,
    /// Response body as text; empty when reading the body failed in `fetch`.
    pub body: String,
}
572
/// Builder for [`Client`]; collect settings, then call [`ClientBuilder::build`].
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
// Each bool is an independent on/off switch.
#[allow(clippy::struct_excessive_bools)]
pub struct ClientBuilder {
    /// Whole-request timeout for the reqwest clients.
    timeout: Duration,
    /// Connect-phase timeout for the reqwest clients.
    connect_timeout: Duration,
    /// User-agent string configured on the reqwest clients themselves.
    user_agent: String,
    /// Whether the reqwest clients follow redirects.
    follow_redirects: bool,
    /// Redirect cap applied when `follow_redirects` is true.
    redirect_limit: usize,
    /// Interval handed to the per-host throttle.
    min_request_interval: Duration,
    /// Optional global requests-per-second cap; `None` means no global throttle.
    max_rps: Option<NonZeroU32>,
    /// Retry policy copied into the client.
    retry: RetryPolicy,
    /// Optional proxy URL for the default client.
    proxy: Option<String>,
    /// User-agent rotation pool copied into the client.
    user_agents: Vec<String>,
    /// Enables enrichment extraction on `Found` outcomes.
    enrich: bool,
    /// When true, `build` creates a robots.txt cache.
    respect_robots: bool,
    /// Optional browser backend.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Cap passed to `BrowserBudget::new`.
    browser_budget: usize,
    /// Egress specs; `build` creates one reqwest client per spec, proxied
    /// through the spec's URL.
    egress: Vec<EgressSpec>,
    /// Session store moved into the client.
    sessions: SessionStore,
    /// Cap passed to `EscalationBudget::new`.
    escalation_budget: usize,
    /// Whether escalation to the browser is allowed.
    escalation_enabled: bool,
}
601
impl Default for ClientBuilder {
    /// Baseline configuration: the module's default timeouts and redirect
    /// limit, redirects followed, no proxy, no global rate cap, no user-agent
    /// rotation, enrichment and robots handling off, no browser backend, empty
    /// egress list and session store, default budgets, escalation enabled.
    fn default() -> Self {
        Self {
            timeout: DEFAULT_TIMEOUT,
            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
            user_agent: default_user_agent(),
            follow_redirects: true,
            redirect_limit: DEFAULT_REDIRECT_LIMIT,
            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
            max_rps: None,
            retry: RetryPolicy::default(),
            proxy: None,
            user_agents: Vec::new(),
            enrich: false,
            respect_robots: false,
            browser: None,
            browser_budget: DEFAULT_BROWSER_BUDGET,
            egress: Vec::new(),
            sessions: SessionStore::new(),
            escalation_budget: DEFAULT_ESCALATION_BUDGET,
            escalation_enabled: true,
        }
    }
}
626
627impl ClientBuilder {
    /// Sets the whole-request timeout given to every reqwest client that
    /// `build` creates.
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }
633
    /// Sets the connect-phase timeout given to every reqwest client that
    /// `build` creates.
    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
        self.connect_timeout = timeout;
        self
    }
639
    /// Sets the user-agent string configured on the reqwest clients.
    ///
    /// This is separate from the rotation pool set by `rotate_user_agents`.
    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
        self.user_agent = user_agent.into();
        self
    }
645
    /// Chooses whether the reqwest clients follow redirects (up to the
    /// builder's redirect limit) or follow none.
    pub fn follow_redirects(mut self, follow: bool) -> Self {
        self.follow_redirects = follow;
        self
    }
652
    /// Sets the interval that `build` passes to the per-host throttle.
    pub fn min_request_interval(mut self, interval: Duration) -> Self {
        self.min_request_interval = interval;
        self
    }
662
    /// Enables the global throttle; `build` gives it an interval of one second
    /// divided by `rps`.
    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
        self.max_rps = Some(rps);
        self
    }
671
    /// Sets `max_retries` on the retry policy.
    pub fn max_retries(mut self, n: u32) -> Self {
        self.retry.max_retries = n;
        self
    }
678
    /// Sets `base_delay` on the retry policy.
    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
        self.retry.base_delay = d;
        self
    }
685
    /// Sets `max_delay` on the retry policy.
    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
        self.retry.max_delay = d;
        self
    }
691
    /// Sets the proxy URL for the default client.
    ///
    /// The URL is validated in `build`, not here. Egress clients use their own
    /// spec URL instead of this one.
    pub fn proxy(mut self, url: impl Into<String>) -> Self {
        self.proxy = Some(url.into());
        self
    }
698
    /// Replaces the user-agent rotation pool.
    ///
    /// With an empty pool the client chooses no per-request user agent.
    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
        self.user_agents = agents;
        self
    }
706
    /// Enables or disables enrichment extraction on `Found` outcomes.
    pub fn enrich(mut self, enrich: bool) -> Self {
        self.enrich = enrich;
        self
    }
713
    /// When `true`, `build` creates a robots.txt cache, which the plain-HTTP
    /// probe path consults before sending a request.
    pub fn respect_robots(mut self, respect: bool) -> Self {
        self.respect_robots = respect;
        self
    }
721
    /// Installs the browser backend used for protected sites and escalation.
    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
        self.browser = Some(backend);
        self
    }
729
    /// Sets the cap that `build` passes to `BrowserBudget::new`.
    pub const fn browser_budget(mut self, cap: usize) -> Self {
        self.browser_budget = cap;
        self
    }
738
    /// Sets the cap that `build` passes to `EscalationBudget::new`.
    pub const fn escalation_budget(mut self, cap: usize) -> Self {
        self.escalation_budget = cap;
        self
    }
749
    /// Turns escalation off; the built client then returns primary outcomes
    /// without attempting a browser re-probe.
    pub const fn disable_escalation(mut self) -> Self {
        self.escalation_enabled = false;
        self
    }
758
    /// Replaces the list of egress specs; `build` creates one proxied client
    /// per spec.
    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
        self.egress = egress;
        self
    }
767
    /// Replaces the session store that sites look their named session up in.
    pub fn sessions(mut self, sessions: SessionStore) -> Self {
        self.sessions = sessions;
        self
    }
777
778 pub fn build(self) -> Result<Client> {
780 let inner = build_reqwest(
781 &self.user_agent,
782 self.timeout,
783 self.connect_timeout,
784 self.follow_redirects,
785 self.redirect_limit,
786 self.proxy.as_deref(),
787 )?;
788
789 let mut egress_entries = Vec::with_capacity(self.egress.len());
793 for spec in &self.egress {
794 let client = build_reqwest(
795 &self.user_agent,
796 self.timeout,
797 self.connect_timeout,
798 self.follow_redirects,
799 self.redirect_limit,
800 Some(&spec.url),
801 )?;
802 egress_entries.push((
803 spec.name.clone(),
804 spec.country.clone(),
805 spec.kind,
806 Arc::new(HttpFetcher::new(client)),
807 ));
808 }
809
810 let global_throttle = self.max_rps.map(|rps| {
811 let interval = Duration::from_secs(1) / rps.get();
813 HostThrottle::new(interval)
814 });
815 let robots = self
816 .respect_robots
817 .then(|| RobotsCache::new(inner.clone(), "adler"));
818 #[cfg(feature = "impersonate")]
822 let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
823 Ok(Client {
824 http: Arc::new(HttpFetcher::new(inner)),
825 egress: Arc::new(EgressPool::new(egress_entries)),
826 sessions: Arc::new(self.sessions),
827 throttle: HostThrottle::new(self.min_request_interval),
828 global_throttle,
829 retry: self.retry,
830 user_agents: Arc::from(self.user_agents),
831 enrich: self.enrich,
832 robots,
833 browser: self.browser,
834 browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
835 escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
836 self.escalation_budget,
837 )),
838 escalation_enabled: self.escalation_enabled,
839 #[cfg(feature = "impersonate")]
840 impersonate,
841 })
842 }
843}
844
845fn build_reqwest(
849 user_agent: &str,
850 timeout: Duration,
851 connect_timeout: Duration,
852 follow_redirects: bool,
853 redirect_limit: usize,
854 proxy: Option<&str>,
855) -> Result<reqwest::Client> {
856 let redirect_policy = if follow_redirects {
857 redirect::Policy::limited(redirect_limit)
858 } else {
859 redirect::Policy::none()
860 };
861 let mut builder = reqwest::Client::builder()
862 .user_agent(user_agent.to_owned())
863 .timeout(timeout)
864 .connect_timeout(connect_timeout)
865 .redirect(redirect_policy);
866 if let Some(proxy_url) = proxy {
867 const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
871 if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
872 return Err(Error::HttpSetup {
873 message: format!(
874 "invalid proxy {proxy_url:?}: must start with one of {}",
875 SCHEMES.join(", ")
876 ),
877 });
878 }
879 let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
880 message: format!("invalid proxy {proxy_url:?}: {e}"),
881 })?;
882 builder = builder.proxy(proxy);
883 }
884 builder.build().map_err(|e| Error::HttpSetup {
885 message: e.to_string(),
886 })
887}
888
/// Browser budget cap used by [`ClientBuilder::default`].
pub const DEFAULT_BROWSER_BUDGET: usize = 50;

/// Escalation budget cap used by [`ClientBuilder::default`].
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
907
908impl fmt::Debug for Client {
909 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
910 f.debug_struct("Client")
911 .field("throttle", &self.throttle)
912 .field("global_throttle", &self.global_throttle)
913 .field("retry", &self.retry)
914 .field("user_agents", &self.user_agents)
915 .field("enrich", &self.enrich)
916 .field("robots", &self.robots.is_some())
917 .field("browser", &self.browser.is_some())
918 .field("browser_budget", &self.browser_budget)
919 .field("escalation_budget", &self.escalation_budget)
920 .field("escalation_enabled", &self.escalation_enabled)
921 .finish_non_exhaustive()
922 }
923}
924
925impl fmt::Debug for ClientBuilder {
926 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
927 f.debug_struct("ClientBuilder")
928 .field("timeout", &self.timeout)
929 .field("connect_timeout", &self.connect_timeout)
930 .field("user_agent", &self.user_agent)
931 .field("follow_redirects", &self.follow_redirects)
932 .field("redirect_limit", &self.redirect_limit)
933 .field("min_request_interval", &self.min_request_interval)
934 .field("max_rps", &self.max_rps)
935 .field("retry", &self.retry)
936 .field("proxy", &self.proxy)
937 .field("user_agents", &self.user_agents)
938 .field("enrich", &self.enrich)
939 .field("respect_robots", &self.respect_robots)
940 .field("browser", &self.browser.is_some())
941 .field("browser_budget", &self.browser_budget)
942 .field("egress", &self.egress)
943 .field("sessions", &self.sessions)
944 .field("escalation_budget", &self.escalation_budget)
945 .field("escalation_enabled", &self.escalation_enabled)
946 .finish()
947 }
948}
949
/// Site tag (matched ASCII case-insensitively) that routes a site to the
/// browser tier when a browser backend is configured.
const BOT_PROTECTED_TAG: &str = "bot-protected";
951
952fn default_user_agent() -> String {
953 format!("adler/{}", env!("CARGO_PKG_VERSION"))
954}
955
956fn host_of(url: &str) -> String {
957 reqwest::Url::parse(url)
958 .ok()
959 .and_then(|u| u.host_str().map(str::to_owned))
960 .unwrap_or_else(|| "unknown".into())
961}
962
963fn origin_and_path(url: &str) -> Option<(String, String)> {
966 let parsed = reqwest::Url::parse(url).ok()?;
967 let host = parsed.host_str()?;
968 let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
969 let origin = format!("{}://{host}{port}", parsed.scheme());
970 let path = parsed.query().map_or_else(
971 || parsed.path().to_owned(),
972 |q| format!("{}?{q}", parsed.path()),
973 );
974 Some((origin, path))
975}
976
977fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
978 CheckOutcome {
979 site: site.to_owned(),
980 url,
981 kind,
982 reason: None,
983 elapsed_ms: elapsed_ms(started),
984 enrichment: std::collections::BTreeMap::new(),
985 evidence: Vec::new(),
986 transport: None,
987 escalations: 0,
988 }
989}
990
991fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
992 CheckOutcome {
993 site: site.to_owned(),
994 url,
995 kind: MatchKind::Uncertain,
996 reason: Some(reason),
997 elapsed_ms: elapsed_ms(started),
998 enrichment: std::collections::BTreeMap::new(),
999 evidence: Vec::new(),
1000 transport: None,
1001 escalations: 0,
1002 }
1003}
1004
/// Milliseconds elapsed since `started`, saturating at `u64::MAX` if the
/// value does not fit in a `u64`.
fn elapsed_ms(started: Instant) -> u64 {
    match u64::try_from(started.elapsed().as_millis()) {
        Ok(millis) => millis,
        Err(_) => u64::MAX,
    }
}
1008
1009#[cfg(test)]
1010mod tests {
1011 use super::*;
1012 use crate::browser::RenderedPage;
1013 use crate::site::{Signal, UrlTemplate};
1014 use wiremock::matchers::{any, method, path};
1015 use wiremock::{Mock, MockServer, ResponseTemplate};
1016
1017 fn build_client() -> Client {
1018 Client::builder()
1019 .timeout(Duration::from_secs(2))
1020 .min_request_interval(Duration::ZERO)
1023 .max_retries(0)
1026 .build()
1027 .expect("client builds")
1028 }
1029
1030 fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
1031 Site {
1032 name: "Mock".into(),
1033 url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
1034 signals,
1035 known_present: None,
1036 known_absent: None,
1037 extract: Vec::new(),
1038 tags: Vec::new(),
1039 request_headers: std::collections::BTreeMap::new(),
1040 regex_check: None,
1041 engine: None,
1042 strip_bad_char: None,
1043 request_method: crate::site::HttpMethod::Get,
1044 request_body: None,
1045 protection: Vec::new(),
1046 disabled: false,
1047 source: None,
1048 popularity: None,
1049 access: crate::AccessPolicy::default(),
1050 }
1051 }
1052
1053 fn user() -> Username {
1054 Username::new("alice").unwrap()
1055 }
1056
1057 #[tokio::test]
1058 async fn regex_check_short_circuits_before_any_request() {
1059 let server = MockServer::start().await;
1063 Mock::given(any())
1064 .respond_with(ResponseTemplate::new(200))
1065 .mount(&server)
1066 .await;
1067 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1068 site.regex_check = Some("^[A-Za-z]{8,}$".into());
1070 let outcome = build_client().check(&site, &user()).await;
1071 assert_eq!(outcome.kind, MatchKind::Uncertain);
1072 assert!(
1073 matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
1074 "expected UsernameNotAllowed, got {:?}",
1075 outcome.reason,
1076 );
1077 let recvd = server.received_requests().await.unwrap_or_default();
1080 assert_eq!(
1081 recvd.len(),
1082 0,
1083 "regex_check mismatch must skip the HTTP request entirely"
1084 );
1085 }
1086
1087 #[tokio::test]
1088 async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
1089 let server = MockServer::start().await;
1092 Mock::given(any())
1093 .respond_with(ResponseTemplate::new(200))
1094 .mount(&server)
1095 .await;
1096 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1097 site.access = crate::access::AccessPolicy {
1100 geo: vec![crate::access::CountryCode::new("pl").unwrap()],
1101 ..crate::access::AccessPolicy::default()
1102 };
1103 let outcome = build_client().check(&site, &user()).await;
1104 assert_eq!(outcome.kind, MatchKind::Uncertain);
1105 assert!(
1106 matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
1107 "expected GeoUnavailable, got {:?}",
1108 outcome.reason,
1109 );
1110 let recvd = server.received_requests().await.unwrap_or_default();
1113 assert_eq!(
1114 recvd.len(),
1115 0,
1116 "geo-unavailable must skip the HTTP request entirely"
1117 );
1118 }
1119
1120 #[tokio::test]
1121 async fn session_headers_are_sent_on_probe() {
1122 let server = MockServer::start().await;
1125 Mock::given(any())
1126 .and(wiremock::matchers::header("cookie", "sessionid=real"))
1127 .respond_with(ResponseTemplate::new(200))
1128 .mount(&server)
1129 .await;
1130 let mut headers = std::collections::BTreeMap::new();
1131 headers.insert("Cookie".to_string(), "sessionid=real".to_string());
1132 let mut store = SessionStore::new();
1133 store.insert("acct", crate::access::Session::from_headers(headers));
1134 let client = Client::builder()
1135 .timeout(Duration::from_secs(2))
1136 .min_request_interval(Duration::ZERO)
1137 .max_retries(0)
1138 .sessions(store)
1139 .build()
1140 .expect("client builds");
1141 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1142 site.access.session = Some("acct".to_string());
1143 let outcome = client.check(&site, &user()).await;
1144 assert_eq!(
1145 outcome.kind,
1146 MatchKind::Found,
1147 "session cookie should unlock the 200 (got {:?})",
1148 outcome.reason,
1149 );
1150 }
1151
1152 #[tokio::test]
1153 async fn missing_named_session_is_session_required() {
1154 let server = MockServer::start().await;
1155 Mock::given(any())
1156 .respond_with(ResponseTemplate::new(200))
1157 .mount(&server)
1158 .await;
1159 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1160 site.access.session = Some("not-configured".to_string());
1162 let outcome = build_client().check(&site, &user()).await;
1163 assert_eq!(outcome.kind, MatchKind::Uncertain);
1164 assert!(
1165 matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
1166 "expected SessionRequired, got {:?}",
1167 outcome.reason,
1168 );
1169 let recvd = server.received_requests().await.unwrap_or_default();
1170 assert_eq!(
1171 recvd.len(),
1172 0,
1173 "a missing session must skip the request, not probe unauthenticated"
1174 );
1175 }
1176
1177 #[cfg(feature = "impersonate")]
1178 #[tokio::test]
1179 async fn impersonate_routes_pure_tls_fingerprint_site() {
1180 let server = MockServer::start().await;
1181 Mock::given(any())
1182 .respond_with(ResponseTemplate::new(200))
1183 .mount(&server)
1184 .await;
1185 let client = Client::builder()
1186 .timeout(Duration::from_secs(2))
1187 .min_request_interval(Duration::ZERO)
1188 .max_retries(0)
1189 .build()
1190 .expect("client builds with impersonate");
1191 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1192 site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];
1195 let outcome = client.check(&site, &user()).await;
1196 assert_eq!(
1197 outcome.kind,
1198 MatchKind::Found,
1199 "expected Found (reason {:?})",
1200 outcome.reason,
1201 );
1202 let recvd = server.received_requests().await.expect("received requests");
1206 assert_eq!(recvd.len(), 1, "expected exactly one request");
1207 let ua = recvd[0]
1208 .headers
1209 .get("user-agent")
1210 .and_then(|v| v.to_str().ok())
1211 .unwrap_or("");
1212 assert!(
1213 ua.contains("Chrome/"),
1214 "expected Chrome-shaped UA from wreq, got {ua:?}"
1215 );
1216 }
1217
1218 #[tokio::test]
1219 async fn regex_check_pass_proceeds_to_probe() {
1220 let server = MockServer::start().await;
1221 Mock::given(any())
1222 .and(path("/alice"))
1223 .respond_with(ResponseTemplate::new(200))
1224 .mount(&server)
1225 .await;
1226 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1227 site.regex_check = Some("^[a-z]{3,}$".into());
1229 let outcome = build_client().check(&site, &user()).await;
1230 assert_eq!(outcome.kind, MatchKind::Found);
1231 }
1232
1233 #[tokio::test]
1234 async fn status_signal_reports_found_on_match() {
1235 let server = MockServer::start().await;
1236 Mock::given(any())
1237 .and(path("/alice"))
1238 .respond_with(ResponseTemplate::new(200))
1239 .mount(&server)
1240 .await;
1241 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1242 let outcome = build_client().check(&site, &user()).await;
1243 assert_eq!(outcome.kind, MatchKind::Found);
1244 assert!(outcome.url.ends_with("/alice"));
1245 assert!(outcome.reason.is_none());
1246 assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
1247 }
1248
1249 #[tokio::test]
1250 async fn status_signal_pair_reports_not_found_on_404() {
1251 let server = MockServer::start().await;
1252 Mock::given(any())
1253 .and(path("/alice"))
1254 .respond_with(ResponseTemplate::new(404))
1255 .mount(&server)
1256 .await;
1257 let site = site_with(
1258 &server,
1259 vec![
1260 Signal::StatusFound { codes: vec![200] },
1261 Signal::StatusNotFound { codes: vec![404] },
1262 ],
1263 );
1264 let outcome = build_client().check(&site, &user()).await;
1265 assert_eq!(outcome.kind, MatchKind::NotFound);
1266 assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
1268 }
1269
1270 #[tokio::test]
1271 async fn body_absent_signal_detects_missing_account() {
1272 let server = MockServer::start().await;
1273 Mock::given(any())
1274 .and(path("/alice"))
1275 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
1276 .mount(&server)
1277 .await;
1278 let site = site_with(
1279 &server,
1280 vec![Signal::BodyAbsent {
1281 text: "Profile not found".into(),
1282 }],
1283 );
1284 let outcome = build_client().check(&site, &user()).await;
1285 assert_eq!(outcome.kind, MatchKind::NotFound);
1286 }
1287
1288 #[tokio::test]
1289 async fn body_absent_alone_yields_uncertain_when_marker_missing() {
1290 let server = MockServer::start().await;
1293 Mock::given(any())
1294 .and(path("/alice"))
1295 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
1296 .mount(&server)
1297 .await;
1298 let site = site_with(
1299 &server,
1300 vec![Signal::BodyAbsent {
1301 text: "Profile not found".into(),
1302 }],
1303 );
1304 let outcome = build_client().check(&site, &user()).await;
1305 assert_eq!(outcome.kind, MatchKind::Uncertain);
1306 }
1307
1308 #[tokio::test]
1309 async fn body_present_plus_absent_resolve_to_found() {
1310 let server = MockServer::start().await;
1311 Mock::given(any())
1312 .and(path("/alice"))
1313 .respond_with(
1314 ResponseTemplate::new(200)
1315 .set_body_string(r#"<div class="profile-card">alice</div>"#),
1316 )
1317 .mount(&server)
1318 .await;
1319 let site = site_with(
1320 &server,
1321 vec![
1322 Signal::BodyPresent {
1323 text: "profile-card".into(),
1324 },
1325 Signal::BodyAbsent {
1326 text: "Profile not found".into(),
1327 },
1328 ],
1329 );
1330 let outcome = build_client().check(&site, &user()).await;
1331 assert_eq!(outcome.kind, MatchKind::Found);
1332 }
1333
1334 #[tokio::test]
1335 async fn redirect_absent_signal_detects_missing_account() {
1336 let server = MockServer::start().await;
1337 Mock::given(any())
1338 .and(path("/alice"))
1339 .respond_with(
1340 ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1341 )
1342 .mount(&server)
1343 .await;
1344 Mock::given(any())
1345 .and(path("/login"))
1346 .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1347 .mount(&server)
1348 .await;
1349 let site = site_with(
1350 &server,
1351 vec![Signal::RedirectAbsent {
1352 fragment: "/login".into(),
1353 }],
1354 );
1355 let outcome = build_client().check(&site, &user()).await;
1356 assert_eq!(outcome.kind, MatchKind::NotFound);
1357 }
1358
1359 #[tokio::test]
1360 async fn negative_signal_wins_over_positive() {
1361 let server = MockServer::start().await;
1366 Mock::given(any())
1367 .and(path("/alice"))
1368 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1369 .mount(&server)
1370 .await;
1371 let site = site_with(
1372 &server,
1373 vec![
1374 Signal::StatusFound { codes: vec![200] },
1375 Signal::BodyAbsent {
1376 text: "Profile not found".into(),
1377 },
1378 ],
1379 );
1380 let outcome = build_client().check(&site, &user()).await;
1381 assert_eq!(outcome.kind, MatchKind::NotFound);
1382 }
1383
1384 #[tokio::test]
1385 async fn network_failure_yields_uncertain() {
1386 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1387 let port = listener.local_addr().unwrap().port();
1388 drop(listener);
1389
1390 let site = Site {
1391 name: "Dead".into(),
1392 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1393 signals: vec![Signal::StatusFound { codes: vec![200] }],
1394 known_present: None,
1395 known_absent: None,
1396 extract: Vec::new(),
1397 tags: Vec::new(),
1398 request_headers: std::collections::BTreeMap::new(),
1399 regex_check: None,
1400 engine: None,
1401 strip_bad_char: None,
1402 request_method: crate::site::HttpMethod::Get,
1403 request_body: None,
1404 protection: Vec::new(),
1405 disabled: false,
1406 source: None,
1407 popularity: None,
1408 access: crate::AccessPolicy::default(),
1409 };
1410 let client = Client::builder()
1411 .timeout(Duration::from_millis(500))
1412 .connect_timeout(Duration::from_millis(500))
1413 .max_retries(0)
1414 .build()
1415 .unwrap();
1416 let outcome = client.check(&site, &user()).await;
1417 assert_eq!(outcome.kind, MatchKind::Uncertain);
1418 assert!(outcome.reason.is_some());
1419 }
1420
1421 #[tokio::test]
1422 async fn throttle_spaces_consecutive_calls_to_same_host() {
1423 let server = MockServer::start().await;
1424 Mock::given(any())
1425 .and(path("/alice"))
1426 .respond_with(ResponseTemplate::new(200))
1427 .mount(&server)
1428 .await;
1429 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1430 let client = Client::builder()
1435 .timeout(Duration::from_secs(2))
1436 .min_request_interval(Duration::from_millis(300))
1437 .build()
1438 .unwrap();
1439
1440 client.check(&site, &user()).await;
1441 let started = Instant::now();
1442 client.check(&site, &user()).await;
1443 let elapsed = started.elapsed();
1444 assert!(
1445 elapsed >= Duration::from_millis(200),
1446 "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1447 );
1448 }
1449
1450 #[tokio::test]
1451 async fn builder_overrides_user_agent() {
1452 let server = MockServer::start().await;
1453 Mock::given(any())
1454 .and(path("/alice"))
1455 .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1456 .respond_with(ResponseTemplate::new(200))
1457 .mount(&server)
1458 .await;
1459 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1460 let client = Client::builder()
1461 .user_agent("adler-test/1.0")
1462 .build()
1463 .unwrap();
1464 let outcome = client.check(&site, &user()).await;
1465 assert_eq!(outcome.kind, MatchKind::Found);
1466 }
1467
1468 #[tokio::test]
1469 async fn rate_limit_429_yields_uncertain_with_note() {
1470 let server = MockServer::start().await;
1471 Mock::given(any())
1472 .and(path("/alice"))
1473 .respond_with(ResponseTemplate::new(429))
1474 .mount(&server)
1475 .await;
1476 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1477 let outcome = build_client().check(&site, &user()).await;
1478 assert_eq!(outcome.kind, MatchKind::Uncertain);
1479 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1480 }
1481
1482 #[tokio::test]
1483 async fn cloudflare_server_header_yields_uncertain() {
1484 let server = MockServer::start().await;
1485 Mock::given(any())
1486 .and(path("/alice"))
1487 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1488 .mount(&server)
1489 .await;
1490 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1491 let outcome = build_client().check(&site, &user()).await;
1492 assert_eq!(outcome.kind, MatchKind::Uncertain);
1493 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1494 }
1495
1496 #[tokio::test]
1497 async fn cloudflare_interstitial_in_body_yields_uncertain() {
1498 let server = MockServer::start().await;
1501 Mock::given(any())
1502 .and(path("/alice"))
1503 .respond_with(
1504 ResponseTemplate::new(200)
1505 .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1506 )
1507 .mount(&server)
1508 .await;
1509 let site = site_with(
1510 &server,
1511 vec![Signal::BodyAbsent {
1512 text: "Profile not found".into(),
1513 }],
1514 );
1515 let outcome = build_client().check(&site, &user()).await;
1516 assert_eq!(outcome.kind, MatchKind::Uncertain);
1517 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1518 }
1519
1520 #[tokio::test]
1521 async fn ban_detection_does_not_fire_on_legitimate_403() {
1522 let server = MockServer::start().await;
1523 Mock::given(any())
1524 .and(path("/alice"))
1525 .respond_with(ResponseTemplate::new(403))
1526 .mount(&server)
1527 .await;
1528 let site = site_with(
1529 &server,
1530 vec![
1531 Signal::StatusFound { codes: vec![200] },
1532 Signal::StatusNotFound { codes: vec![403] },
1533 ],
1534 );
1535 let outcome = build_client().check(&site, &user()).await;
1536 assert_eq!(outcome.kind, MatchKind::NotFound);
1538 assert!(outcome.reason.is_none());
1539 }
1540
1541 #[tokio::test]
1542 async fn retry_recovers_after_transient_429() {
1543 let server = MockServer::start().await;
1544 Mock::given(any())
1546 .and(path("/alice"))
1547 .respond_with(ResponseTemplate::new(429))
1548 .up_to_n_times(1)
1549 .mount(&server)
1550 .await;
1551 Mock::given(any())
1552 .and(path("/alice"))
1553 .respond_with(ResponseTemplate::new(200))
1554 .mount(&server)
1555 .await;
1556 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1557 let client = Client::builder()
1558 .timeout(Duration::from_secs(2))
1559 .min_request_interval(Duration::ZERO)
1560 .max_retries(2)
1561 .base_backoff_delay(Duration::from_millis(20))
1562 .max_backoff_delay(Duration::from_millis(100))
1563 .build()
1564 .unwrap();
1565 let outcome = client.check(&site, &user()).await;
1566 assert_eq!(outcome.kind, MatchKind::Found);
1567 assert!(outcome.reason.is_none());
1568 }
1569
1570 #[tokio::test]
1571 async fn retry_exhausts_and_returns_uncertain() {
1572 let server = MockServer::start().await;
1573 Mock::given(any())
1574 .and(path("/alice"))
1575 .respond_with(ResponseTemplate::new(429))
1576 .mount(&server)
1577 .await;
1578 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1579 let client = Client::builder()
1580 .timeout(Duration::from_secs(2))
1581 .min_request_interval(Duration::ZERO)
1582 .max_retries(2)
1583 .base_backoff_delay(Duration::from_millis(10))
1584 .max_backoff_delay(Duration::from_millis(50))
1585 .build()
1586 .unwrap();
1587 let outcome = client.check(&site, &user()).await;
1588 assert_eq!(outcome.kind, MatchKind::Uncertain);
1589 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1590 }
1591
1592 #[tokio::test]
1593 async fn retry_does_not_fire_on_network_error() {
1594 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1598 let port = listener.local_addr().unwrap().port();
1599 drop(listener);
1600 let site = Site {
1601 name: "Dead".into(),
1602 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1603 signals: vec![Signal::StatusFound { codes: vec![200] }],
1604 known_present: None,
1605 known_absent: None,
1606 extract: Vec::new(),
1607 tags: Vec::new(),
1608 request_headers: std::collections::BTreeMap::new(),
1609 regex_check: None,
1610 engine: None,
1611 strip_bad_char: None,
1612 request_method: crate::site::HttpMethod::Get,
1613 request_body: None,
1614 protection: Vec::new(),
1615 disabled: false,
1616 source: None,
1617 popularity: None,
1618 access: crate::AccessPolicy::default(),
1619 };
1620 let client = Client::builder()
1621 .timeout(Duration::from_millis(500))
1622 .connect_timeout(Duration::from_millis(500))
1623 .min_request_interval(Duration::ZERO)
1624 .max_retries(3)
1625 .base_backoff_delay(Duration::from_secs(60))
1626 .build()
1627 .unwrap();
1628 let started = Instant::now();
1629 let outcome = client.check(&site, &user()).await;
1630 assert!(started.elapsed() < Duration::from_secs(5));
1633 assert_eq!(outcome.kind, MatchKind::Uncertain);
1634 assert!(
1635 matches!(outcome.reason, Some(UncertainReason::Network(_))),
1636 "got {:?}",
1637 outcome.reason,
1638 );
1639 }
1640
1641 #[tokio::test]
1642 async fn rotates_user_agent_per_request() {
1643 let server = MockServer::start().await;
1647 Mock::given(any())
1648 .and(path("/alice"))
1649 .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1650 .respond_with(ResponseTemplate::new(200))
1651 .mount(&server)
1652 .await;
1653 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1654 let client = Client::builder()
1655 .min_request_interval(Duration::ZERO)
1656 .max_retries(0)
1657 .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1658 .build()
1659 .unwrap();
1660 let outcome = client.check(&site, &user()).await;
1661 assert_eq!(outcome.kind, MatchKind::Found);
1662 }
1663
/// Building a client with the proxy string `"not a url"` must fail with
/// `Error::HttpSetup`.
#[test]
fn invalid_proxy_url_fails_build() {
    let built = Client::builder().proxy("not a url").build();
    let err = built.unwrap_err();
    let is_http_setup = matches!(err, Error::HttpSetup { .. });
    assert!(is_http_setup);
}
1669
/// A proxy string without a scheme must fail with `Error::HttpSetup` whose
/// message contains "must start with".
#[test]
fn schemeless_proxy_is_rejected_up_front() {
    let built = Client::builder().proxy("not-a-url").build();
    match built.unwrap_err() {
        Error::HttpSetup { message } => {
            assert!(message.contains("must start with"), "{message}");
        }
        err => panic!("expected HttpSetup, got {err:?}"),
    }
}
1679
/// A `socks5://` proxy URL must be accepted by the builder.
#[test]
fn socks5_proxy_scheme_is_accepted() {
    let built = Client::builder().proxy("socks5://127.0.0.1:9050").build();
    assert!(built.is_ok());
}
1690
1691 #[tokio::test]
1692 async fn global_rps_cap_spaces_requests_across_hosts() {
1693 let server = MockServer::start().await;
1696 Mock::given(any())
1697 .respond_with(ResponseTemplate::new(200))
1698 .mount(&server)
1699 .await;
1700 let site_a = Site {
1701 name: "A".into(),
1702 url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1703 signals: vec![Signal::StatusFound { codes: vec![200] }],
1704 known_present: None,
1705 known_absent: None,
1706 extract: Vec::new(),
1707 tags: Vec::new(),
1708 request_headers: std::collections::BTreeMap::new(),
1709 regex_check: None,
1710 engine: None,
1711 strip_bad_char: None,
1712 request_method: crate::site::HttpMethod::Get,
1713 request_body: None,
1714 protection: Vec::new(),
1715 disabled: false,
1716 source: None,
1717 popularity: None,
1718 access: crate::AccessPolicy::default(),
1719 };
1720 let site_b = Site {
1721 name: "B".into(),
1722 url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1723 signals: vec![Signal::StatusFound { codes: vec![200] }],
1724 known_present: None,
1725 known_absent: None,
1726 extract: Vec::new(),
1727 tags: Vec::new(),
1728 request_headers: std::collections::BTreeMap::new(),
1729 regex_check: None,
1730 engine: None,
1731 strip_bad_char: None,
1732 request_method: crate::site::HttpMethod::Get,
1733 request_body: None,
1734 protection: Vec::new(),
1735 disabled: false,
1736 source: None,
1737 popularity: None,
1738 access: crate::AccessPolicy::default(),
1739 };
1740 let client = Client::builder()
1745 .min_request_interval(Duration::ZERO)
1746 .max_retries(0)
1747 .max_rps(std::num::NonZeroU32::new(2).unwrap())
1748 .build()
1749 .unwrap();
1750 client.check(&site_a, &user()).await;
1753 let started = Instant::now();
1754 client.check(&site_b, &user()).await;
1755 assert!(
1756 started.elapsed() >= Duration::from_millis(350),
1757 "global cap should space cross-host requests, got {:?}",
1758 started.elapsed(),
1759 );
1760 }
1761
1762 #[tokio::test]
1763 async fn respect_robots_skips_disallowed_paths() {
1764 let server = MockServer::start().await;
1765 Mock::given(any())
1766 .and(path("/robots.txt"))
1767 .respond_with(
1768 ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1769 )
1770 .mount(&server)
1771 .await;
1772 Mock::given(any())
1773 .and(path("/no/alice"))
1774 .respond_with(ResponseTemplate::new(200))
1775 .mount(&server)
1776 .await;
1777 Mock::given(any())
1778 .and(path("/yes/alice"))
1779 .respond_with(ResponseTemplate::new(200))
1780 .mount(&server)
1781 .await;
1782 let client = Client::builder()
1783 .min_request_interval(Duration::ZERO)
1784 .max_retries(0)
1785 .respect_robots(true)
1786 .build()
1787 .unwrap();
1788
1789 let disallowed = Site {
1790 name: "No".into(),
1791 url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1792 signals: vec![Signal::StatusFound { codes: vec![200] }],
1793 known_present: None,
1794 known_absent: None,
1795 extract: Vec::new(),
1796 tags: Vec::new(),
1797 request_headers: std::collections::BTreeMap::new(),
1798 regex_check: None,
1799 engine: None,
1800 strip_bad_char: None,
1801 request_method: crate::site::HttpMethod::Get,
1802 request_body: None,
1803 protection: Vec::new(),
1804 disabled: false,
1805 source: None,
1806 popularity: None,
1807 access: crate::AccessPolicy::default(),
1808 };
1809 let allowed = Site {
1810 name: "Yes".into(),
1811 url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1812 signals: vec![Signal::StatusFound { codes: vec![200] }],
1813 known_present: None,
1814 known_absent: None,
1815 extract: Vec::new(),
1816 tags: Vec::new(),
1817 request_headers: std::collections::BTreeMap::new(),
1818 regex_check: None,
1819 engine: None,
1820 strip_bad_char: None,
1821 request_method: crate::site::HttpMethod::Get,
1822 request_body: None,
1823 protection: Vec::new(),
1824 disabled: false,
1825 source: None,
1826 popularity: None,
1827 access: crate::AccessPolicy::default(),
1828 };
1829
1830 let no = client.check(&disallowed, &user()).await;
1831 assert_eq!(no.kind, MatchKind::Uncertain);
1832 assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1833
1834 let yes = client.check(&allowed, &user()).await;
1835 assert_eq!(yes.kind, MatchKind::Found);
1836 }
1837
1838 #[tokio::test]
1839 async fn body_read_skipped_when_no_body_signal_needed() {
1840 let server = MockServer::start().await;
1843 Mock::given(any())
1844 .and(path("/alice"))
1845 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1846 .mount(&server)
1847 .await;
1848 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1849 let outcome = build_client().check(&site, &user()).await;
1850 assert_eq!(outcome.kind, MatchKind::Found);
1851 }
1852
1853 #[derive(Debug)]
1859 struct RecordingBackend {
1860 page: RenderedPage,
1861 calls: std::sync::atomic::AtomicUsize,
1862 }
1863
1864 impl RecordingBackend {
1865 fn with_page(page: RenderedPage) -> Self {
1866 Self {
1867 page,
1868 calls: std::sync::atomic::AtomicUsize::new(0),
1869 }
1870 }
1871 fn call_count(&self) -> usize {
1872 self.calls.load(std::sync::atomic::Ordering::SeqCst)
1873 }
1874 }
1875
1876 #[async_trait::async_trait]
1877 impl BrowserBackend for RecordingBackend {
1878 async fn fetch(
1879 &self,
1880 _url: &url::Url,
1881 _headers: &std::collections::BTreeMap<String, String>,
1882 _timeout: Duration,
1883 ) -> Result<RenderedPage> {
1884 self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1885 Ok(self.page.clone())
1886 }
1887 }
1888
1889 fn site_bot_protected(server: &MockServer) -> Site {
1890 let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1891 s.tags = vec!["bot-protected".into()];
1892 s
1893 }
1894
1895 #[tokio::test]
1896 async fn browser_routes_bot_protected_sites() {
1897 let server = MockServer::start().await;
1900 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1901 status: 200,
1902 final_url: url::Url::parse("https://example.com/alice").unwrap(),
1903 body: "<html></html>".into(),
1904 elapsed_ms: 42,
1905 }));
1906 let client = Client::builder()
1907 .min_request_interval(Duration::ZERO)
1908 .max_retries(0)
1909 .browser(backend.clone())
1910 .build()
1911 .unwrap();
1912 let outcome = client.check(&site_bot_protected(&server), &user()).await;
1913 assert_eq!(outcome.kind, MatchKind::Found);
1914 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1915 }
1916
1917 #[tokio::test]
1918 async fn non_bot_protected_sites_skip_browser() {
1919 let server = MockServer::start().await;
1920 Mock::given(any())
1921 .and(path("/alice"))
1922 .respond_with(ResponseTemplate::new(200))
1923 .mount(&server)
1924 .await;
1925 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1926 status: 500, final_url: url::Url::parse("https://x/").unwrap(),
1928 body: String::new(),
1929 elapsed_ms: 0,
1930 }));
1931 let client = Client::builder()
1932 .min_request_interval(Duration::ZERO)
1933 .max_retries(0)
1934 .browser(backend.clone())
1935 .build()
1936 .unwrap();
1937 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1939 let outcome = client.check(&site, &user()).await;
1940 assert_eq!(outcome.kind, MatchKind::Found);
1941 assert_eq!(backend.call_count(), 0, "browser must not be touched");
1942 }
1943
1944 #[tokio::test]
1945 async fn browser_budget_exhaust_yields_uncertain() {
1946 let server = MockServer::start().await;
1947 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1948 status: 200,
1949 final_url: url::Url::parse("https://x/").unwrap(),
1950 body: String::new(),
1951 elapsed_ms: 0,
1952 }));
1953 let client = Client::builder()
1954 .min_request_interval(Duration::ZERO)
1955 .max_retries(0)
1956 .browser(backend.clone())
1957 .browser_budget(1)
1958 .build()
1959 .unwrap();
1960 let site = site_bot_protected(&server);
1961 let first = client.check(&site, &user()).await;
1963 assert_eq!(first.kind, MatchKind::Found);
1964 let second = client.check(&site, &user()).await;
1966 assert_eq!(second.kind, MatchKind::Uncertain);
1967 assert!(matches!(
1968 second.reason,
1969 Some(UncertainReason::BrowserBudget)
1970 ));
1971 assert_eq!(
1972 backend.call_count(),
1973 1,
1974 "second call must not invoke backend"
1975 );
1976 }
1977
1978 #[tokio::test]
1979 async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1980 struct FailingBackend;
1981 #[async_trait::async_trait]
1982 impl BrowserBackend for FailingBackend {
1983 async fn fetch(
1984 &self,
1985 _url: &url::Url,
1986 _headers: &std::collections::BTreeMap<String, String>,
1987 _timeout: Duration,
1988 ) -> Result<RenderedPage> {
1989 Err(Error::BrowserSetup {
1990 message: "simulated crash".into(),
1991 })
1992 }
1993 }
1994 impl std::fmt::Debug for FailingBackend {
1995 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1996 f.write_str("FailingBackend")
1997 }
1998 }
1999
2000 let server = MockServer::start().await;
2001 let client = Client::builder()
2002 .min_request_interval(Duration::ZERO)
2003 .max_retries(0)
2004 .browser(Arc::new(FailingBackend))
2005 .build()
2006 .unwrap();
2007 let outcome = client.check(&site_bot_protected(&server), &user()).await;
2008 assert_eq!(outcome.kind, MatchKind::Uncertain);
2009 match outcome.reason {
2010 Some(UncertainReason::BrowserFailed(msg)) => {
2011 assert!(msg.contains("simulated crash"), "got: {msg}");
2012 }
2013 other => panic!("expected BrowserFailed, got {other:?}"),
2014 }
2015 }
2016
2017 #[tokio::test]
2018 async fn status_only_site_uses_head_request() {
2019 let server = MockServer::start().await;
2023 Mock::given(method("HEAD"))
2024 .and(path("/alice"))
2025 .respond_with(ResponseTemplate::new(200))
2026 .mount(&server)
2027 .await;
2028 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2029 let outcome = build_client().check(&site, &user()).await;
2030 assert_eq!(outcome.kind, MatchKind::Found);
2031 let recvd = server.received_requests().await.unwrap_or_default();
2032 assert_eq!(recvd.len(), 1);
2033 assert_eq!(recvd[0].method.as_str(), "HEAD");
2034 }
2035
2036 #[tokio::test]
2037 async fn body_signal_site_uses_get_request() {
2038 let server = MockServer::start().await;
2041 Mock::given(any())
2042 .and(path("/alice"))
2043 .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
2044 .mount(&server)
2045 .await;
2046 let site = site_with(
2047 &server,
2048 vec![Signal::BodyPresent {
2049 text: "hello".into(),
2050 }],
2051 );
2052 let outcome = build_client().check(&site, &user()).await;
2053 assert_eq!(outcome.kind, MatchKind::Found);
2054 let recvd = server.received_requests().await.unwrap_or_default();
2055 assert_eq!(recvd[0].method.as_str(), "GET");
2056 }
2057
2058 #[tokio::test]
2059 async fn protection_field_routes_through_browser_like_bot_protected_tag() {
2060 let server = MockServer::start().await;
2065 Mock::given(any())
2066 .respond_with(ResponseTemplate::new(200))
2067 .mount(&server)
2068 .await;
2069 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2070 site.protection = vec![crate::site::ProtectionKind::Cloudflare];
2071 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2073 status: 200,
2074 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2075 body: String::new(),
2076 elapsed_ms: 0,
2077 }));
2078 let client = Client::builder()
2079 .min_request_interval(Duration::ZERO)
2080 .max_retries(0)
2081 .browser(backend)
2082 .build()
2083 .unwrap();
2084 let outcome = client.check(&site, &user()).await;
2085 assert_eq!(outcome.kind, MatchKind::Found);
2088 let recvd = server.received_requests().await.unwrap_or_default();
2090 assert_eq!(
2091 recvd.len(),
2092 0,
2093 "structured protection must skip the raw HTTP path"
2094 );
2095 }
2096
2097 #[tokio::test]
2098 async fn post_method_sends_body_with_username_substituted() {
2099 let server = MockServer::start().await;
2103 Mock::given(method("POST"))
2104 .and(path("/api"))
2105 .respond_with(ResponseTemplate::new(200))
2106 .mount(&server)
2107 .await;
2108 let site = Site {
2113 name: "ApiPost".into(),
2114 url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
2115 signals: vec![Signal::StatusFound { codes: vec![200] }],
2116 known_present: None,
2117 known_absent: None,
2118 extract: Vec::new(),
2119 tags: Vec::new(),
2120 request_headers: std::collections::BTreeMap::new(),
2121 regex_check: None,
2122 engine: None,
2123 strip_bad_char: None,
2124 request_method: HttpMethod::Post,
2125 request_body: Some(r#"{"name":"{username}"}"#.into()),
2126 protection: Vec::new(),
2127 disabled: false,
2128 source: None,
2129 popularity: None,
2130 access: crate::AccessPolicy::default(),
2131 };
2132 let outcome = build_client().check(&site, &user()).await;
2133 assert_eq!(outcome.kind, MatchKind::Found);
2134 let recvd = server.received_requests().await.unwrap_or_default();
2135 assert_eq!(recvd.len(), 1);
2136 assert_eq!(recvd[0].method.as_str(), "POST");
2137 let body = String::from_utf8_lossy(&recvd[0].body).to_string();
2138 assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
2139 }
2140
2141 #[tokio::test]
2142 async fn head_405_falls_back_to_get() {
2143 let server = MockServer::start().await;
2146 Mock::given(method("HEAD"))
2147 .and(path("/alice"))
2148 .respond_with(ResponseTemplate::new(405))
2149 .mount(&server)
2150 .await;
2151 Mock::given(any())
2152 .and(path("/alice"))
2153 .respond_with(ResponseTemplate::new(200))
2154 .mount(&server)
2155 .await;
2156 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2157 let outcome = build_client().check(&site, &user()).await;
2158 assert_eq!(outcome.kind, MatchKind::Found);
2159 let recvd = server.received_requests().await.unwrap_or_default();
2160 assert_eq!(recvd.len(), 2);
2161 assert_eq!(recvd[0].method.as_str(), "HEAD");
2162 assert_eq!(recvd[1].method.as_str(), "GET");
2163 }
2164
2165 async fn cloudflare_503_server() -> MockServer {
2174 let server = MockServer::start().await;
2175 Mock::given(any())
2176 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
2177 .mount(&server)
2178 .await;
2179 server
2180 }
2181
2182 #[tokio::test]
2183 async fn http_success_stamps_http_transport_no_escalations() {
2184 let server = MockServer::start().await;
2185 Mock::given(any())
2186 .respond_with(ResponseTemplate::new(200))
2187 .mount(&server)
2188 .await;
2189 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2190 let outcome = build_client().check(&site, &user()).await;
2191 assert_eq!(outcome.kind, MatchKind::Found);
2192 assert_eq!(
2193 outcome.transport,
2194 Some(crate::escalation::TransportTier::Http),
2195 "successful HTTP probe must stamp Http transport"
2196 );
2197 assert_eq!(outcome.escalations, 0, "no escalation on the happy path");
2198 }
2199
2200 #[tokio::test]
2201 async fn escalates_cloudflare_uncertain_to_browser_and_stamps_one() {
2202 let server = cloudflare_503_server().await;
2203 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2205 status: 200,
2206 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2207 body: String::new(),
2208 elapsed_ms: 5,
2209 }));
2210 let client = Client::builder()
2211 .min_request_interval(Duration::ZERO)
2212 .max_retries(0)
2213 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2214 .build()
2215 .unwrap();
2216 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2219 let outcome = client.check(&site, &user()).await;
2220 assert_eq!(
2221 outcome.kind,
2222 MatchKind::Found,
2223 "escalation should flip CF challenge to Found via browser (reason {:?})",
2224 outcome.reason
2225 );
2226 assert_eq!(
2227 outcome.transport,
2228 Some(crate::escalation::TransportTier::Browser),
2229 "escalated outcome must be stamped Browser"
2230 );
2231 assert_eq!(
2232 outcome.escalations, 1,
2233 "exactly one escalation should have fired"
2234 );
2235 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
2236 }
2237
2238 #[tokio::test]
2239 async fn disable_escalation_leaves_cloudflare_uncertain_untouched() {
2240 let server = cloudflare_503_server().await;
2241 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2242 status: 200,
2243 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2244 body: String::new(),
2245 elapsed_ms: 0,
2246 }));
2247 let client = Client::builder()
2248 .min_request_interval(Duration::ZERO)
2249 .max_retries(0)
2250 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2251 .disable_escalation()
2252 .build()
2253 .unwrap();
2254 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2255 let outcome = client.check(&site, &user()).await;
2256 assert_eq!(outcome.kind, MatchKind::Uncertain);
2257 assert!(matches!(
2258 outcome.reason,
2259 Some(UncertainReason::CloudflareChallenge)
2260 ));
2261 assert_eq!(
2262 outcome.transport,
2263 Some(crate::escalation::TransportTier::Http),
2264 "primary transport must still be stamped"
2265 );
2266 assert_eq!(outcome.escalations, 0);
2267 assert_eq!(
2268 backend.call_count(),
2269 0,
2270 "browser must not be touched when --no-escalation"
2271 );
2272 }
2273
2274 #[tokio::test]
2275 async fn escalation_budget_zero_keeps_browser_untouched() {
2276 let server = cloudflare_503_server().await;
2277 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2278 status: 200,
2279 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2280 body: String::new(),
2281 elapsed_ms: 0,
2282 }));
2283 let client = Client::builder()
2284 .min_request_interval(Duration::ZERO)
2285 .max_retries(0)
2286 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2287 .escalation_budget(0)
2288 .build()
2289 .unwrap();
2290 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2291 let outcome = client.check(&site, &user()).await;
2292 assert_eq!(outcome.kind, MatchKind::Uncertain);
2293 assert!(matches!(
2294 outcome.reason,
2295 Some(UncertainReason::CloudflareChallenge)
2296 ));
2297 assert_eq!(outcome.escalations, 0);
2298 assert_eq!(
2299 backend.call_count(),
2300 0,
2301 "zero budget must deny every escalation"
2302 );
2303 }
2304
2305 #[tokio::test]
2306 async fn escalation_consumes_budget_then_stops() {
2307 let server = cloudflare_503_server().await;
2308 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2309 status: 200,
2310 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2311 body: String::new(),
2312 elapsed_ms: 0,
2313 }));
2314 let client = Client::builder()
2315 .min_request_interval(Duration::ZERO)
2316 .max_retries(0)
2317 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2318 .escalation_budget(1)
2319 .build()
2320 .unwrap();
2321 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2322 let first = client.check(&site, &user()).await;
2324 assert_eq!(first.kind, MatchKind::Found);
2325 assert_eq!(first.escalations, 1);
2326 let second = client.check(&site, &user()).await;
2328 assert_eq!(second.kind, MatchKind::Uncertain);
2329 assert!(matches!(
2330 second.reason,
2331 Some(UncertainReason::CloudflareChallenge)
2332 ));
2333 assert_eq!(second.escalations, 0);
2334 assert_eq!(backend.call_count(), 1, "browser called exactly once total");
2335 }
2336}