1use std::borrow::Cow;
11use std::collections::BTreeMap;
12use std::fmt;
13use std::num::NonZeroU32;
14use std::sync::Arc;
15use std::time::{Duration, Instant};
16
17use reqwest::redirect;
18
19use crate::access::{EgressChoice, EgressPool, EgressSpec, SessionStore};
20use crate::browser::{BrowserBackend, BrowserBudget};
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::error::{Error, Result};
23use crate::retry::{self, RetryPolicy};
24use crate::robots::RobotsCache;
25use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
26use crate::throttle::HostThrottle;
27#[cfg(feature = "impersonate")]
28use crate::transport::ImpersonateFetcher;
29use crate::transport::{
30 BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
31};
32use crate::username::Username;
33
/// Whole-request timeout a fresh [`ClientBuilder`] starts with.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connect timeout a fresh [`ClientBuilder`] starts with.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Maximum number of redirects followed when redirect-following is enabled.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Default `min_request_interval`; `build` hands it to the per-host throttle.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// The single key every request waits on when the global throttle is enabled.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
40
/// Probing client: bundles the fetchers, throttles, retry policy and the
/// optional robots / browser / impersonation components consulted by
/// [`Client::check`].
///
/// Construct one through [`Client::builder`]. `Clone` is derived, so a clone
/// shares every `Arc`-held component with the original.
#[derive(Clone)]
pub struct Client {
    /// Default fetcher: used by [`Client::fetch`] and whenever egress
    /// selection answers `EgressChoice::Default`.
    http: Arc<HttpFetcher>,
    /// Pool of named egress fetchers selected per site from its access policy.
    egress: Arc<EgressPool>,
    /// Named sessions; a site naming one gets the session applied on top of
    /// its own request headers.
    sessions: Arc<SessionStore>,
    /// Throttle waited on with the URL's host before each HTTP request.
    throttle: HostThrottle,
    /// Throttle waited on under [`GLOBAL_THROTTLE_KEY`]; `Some` only when a
    /// `max_rps` was configured on the builder.
    global_throttle: Option<HostThrottle>,
    /// Policy passed to `retry::should_retry` and `retry::backoff_delay`.
    retry: RetryPolicy,
    /// Pool of User-Agent strings; one is picked per request. When empty no
    /// per-request User-Agent is chosen.
    user_agents: Arc<[String]>,
    /// Whether enrichment extraction runs for `Found` results on sites that
    /// declare extract rules.
    enrich: bool,
    /// `Some` when robots.txt is respected; disallowed URLs are reported as
    /// `UncertainReason::RobotsDisallowed` without being requested.
    robots: Option<RobotsCache>,
    /// Optional browser backend, used for protected sites and for escalation.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Fetcher used for sites whose only declared protection is
    /// `ProtectionKind::TlsFingerprint` (only with the `impersonate` feature).
    #[cfg(feature = "impersonate")]
    impersonate: Option<Arc<ImpersonateFetcher>>,
    /// Budget consumed once per probe that is routed straight to the browser.
    browser_budget: Arc<BrowserBudget>,
    /// Budget consumed once per HTTP-to-browser escalation.
    escalation_budget: Arc<crate::escalation::EscalationBudget>,
    /// When `false`, uncertain HTTP outcomes are never escalated to the browser.
    escalation_enabled: bool,
}
90
91impl Client {
92 pub fn builder() -> ClientBuilder {
94 ClientBuilder::default()
95 }
96
97 #[must_use]
102 pub fn egress_summary(&self) -> Vec<crate::access::EgressSummary> {
103 self.egress.summary()
104 }
105
106 #[must_use]
110 pub fn session_names(&self) -> Vec<String> {
111 self.sessions.names()
112 }
113
114 #[must_use]
118 pub fn egress_names(&self) -> Vec<String> {
119 self.egress.names()
120 }
121
122 #[must_use]
135 pub fn with_egress_subset(&self, names: &[String]) -> Self {
136 Self {
137 http: Arc::clone(&self.http),
138 egress: Arc::new(self.egress.subset(names)),
139 sessions: Arc::clone(&self.sessions),
140 throttle: self.throttle.clone(),
141 global_throttle: self.global_throttle.clone(),
142 retry: self.retry.clone(),
143 user_agents: Arc::clone(&self.user_agents),
144 enrich: self.enrich,
145 robots: self.robots.clone(),
146 browser: self.browser.clone(),
147 #[cfg(feature = "impersonate")]
148 impersonate: self.impersonate.clone(),
149 browser_budget: Arc::clone(&self.browser_budget),
150 escalation_budget: Arc::clone(&self.escalation_budget),
151 escalation_enabled: self.escalation_enabled,
152 }
153 }
154
155 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
169 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
170 let mut attempt: u32 = 0;
171 loop {
172 let outcome = self.probe_once(site, username).await;
173 if !retry::should_retry(&outcome, attempt, &self.retry) {
174 return outcome;
175 }
176 let delay = retry::backoff_delay(attempt, &self.retry);
177 tracing::info!(
178 site = %site.name,
179 attempt = attempt + 1,
180 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
181 ?delay,
182 "transient ban, retrying",
183 );
184 tokio::time::sleep(delay).await;
185 attempt += 1;
186 }
187 }
188
189 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
198 let host = host_of(url);
199 if let Some(global) = &self.global_throttle {
200 global.wait(GLOBAL_THROTTLE_KEY).await;
201 }
202 self.throttle.wait(&host).await;
203 let mut request = self.http.client().get(url);
204 if let Some(ua) = self.pick_user_agent() {
205 request = request.header(reqwest::header::USER_AGENT, ua);
206 }
207 let response = request.send().await.ok()?;
208 let status = response.status().as_u16();
209 let final_url = response.url().to_string();
210 let body = response.text().await.unwrap_or_default();
211 Some(RawResponse {
212 status,
213 final_url,
214 body,
215 })
216 }
217
218 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
229 if let Some(backend) = self.browser.as_deref() {
230 let has_tag = site
231 .tags
232 .iter()
233 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
234 if has_tag || !site.protection.is_empty() {
235 let parsed = url::Url::parse(url).ok()?;
236 match backend
237 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
238 .await
239 {
240 Ok(page) => {
241 return Some(RawResponse {
242 status: page.status,
243 final_url: page.final_url.to_string(),
244 body: page.body,
245 });
246 }
247 Err(err) => {
248 tracing::warn!(
249 site = %site.name, %url, error = %err,
250 "browser fetch failed in doctor; falling back to raw HTTP",
251 );
252 }
253 }
254 }
255 }
256 self.fetch(url).await
257 }
258
259 fn pick_user_agent(&self) -> Option<&str> {
262 match self.user_agents.len() {
263 0 => None,
264 1 => Some(&self.user_agents[0]),
265 n => Some(&self.user_agents[fastrand::usize(0..n)]),
266 }
267 }
268
    /// Runs a single probe of `site` for `username`, with no retry.
    ///
    /// Steps, in order:
    /// 1. Username pre-check against `site.regex_check`.
    /// 2. Session header resolution. A named but unconfigured session ends the
    ///    probe with `SessionRequired`.
    /// 3. Browser routing for tagged or protected sites when a backend is set.
    /// 4. Impersonation routing for pure TLS-fingerprint sites (feature-gated).
    /// 5. Plain HTTP: egress selection, robots check, throttling, request.
    ///
    /// Early exits return an `Uncertain` outcome built with a fresh
    /// `Instant::now()`, so their elapsed time is effectively zero. The
    /// impersonation and HTTP results are handed to `maybe_escalate`.
    // One linear routing function; splitting it would scatter the ordering
    // that the steps above depend on.
    #[allow(clippy::too_many_lines)]
    async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
        let url = site.url_for(username);

        if let Some(pat) = &site.regex_check {
            // A pattern that fails to compile is ignored: the probe proceeds as
            // if the site had no `regex_check`.
            if let Ok(re) = regex::Regex::new(pat) {
                if !re.is_match(username.as_str()) {
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::UsernameNotAllowed,
                    );
                }
            }
        }

        // Borrow the site's headers when no session is involved; only allocate
        // a merged map when a session has to be applied on top of them.
        let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
            None => Cow::Borrowed(&site.request_headers),
            Some(name) => match self.sessions.get(name) {
                Some(session) => Cow::Owned(session.apply(&site.request_headers)),
                None => {
                    // Never probe unauthenticated when a session was requested.
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::SessionRequired,
                    );
                }
            },
        };
        let headers: &BTreeMap<String, String> = &session_headers;

        if let Some(backend) = &self.browser {
            let has_tag = site
                .tags
                .iter()
                .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if has_tag || !site.protection.is_empty() {
                if self.browser_budget.try_consume() {
                    let started = Instant::now();
                    // The browser request carries no body and no User-Agent
                    // override, and always asks for the response body.
                    let req = FetchRequest {
                        method: site.request_method,
                        url: &url,
                        body: None,
                        user_agent: None,
                        headers,
                        want_body: true,
                    };
                    let fetcher = BrowserFetcher::new(Arc::clone(backend));
                    let mut outcome = match fetcher.fetch(&req).await {
                        Ok(resp) => self.finish(site, url, started, &resp),
                        Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
                    };
                    outcome.transport = Some(crate::escalation::TransportTier::Browser);
                    return outcome;
                }
                // Budget spent: report uncertainty instead of falling back to
                // plain HTTP for a site that was routed to the browser.
                tracing::warn!(site = %site.name, "browser budget exhausted");
                let mut outcome = uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::BrowserBudget,
                );
                outcome.transport = Some(crate::escalation::TransportTier::Browser);
                return outcome;
            }
        }

        #[cfg(feature = "impersonate")]
        if let Some(fetcher) = &self.impersonate {
            // Only sites whose sole protection is TLS fingerprinting, and which
            // are not tagged bot-protected, go through the impersonating fetcher.
            let pure_tls = site.protection.len() == 1
                && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
                && !site
                    .tags
                    .iter()
                    .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
            if pure_tls {
                let started = Instant::now();
                let req = FetchRequest {
                    method: site.request_method,
                    url: &url,
                    body: None,
                    user_agent: self.pick_user_agent(),
                    headers,
                    want_body: true,
                };
                let mut primary = match fetcher.fetch(&req).await {
                    Ok(resp) => self.finish(site, url.clone(), started, &resp),
                    Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
                };
                primary.transport = Some(crate::escalation::TransportTier::Impersonate);
                return self.maybe_escalate(site, &url, headers, primary).await;
            }
        }

        let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
            EgressChoice::Default => Arc::clone(&self.http),
            EgressChoice::Use(fetcher) => fetcher,
            EgressChoice::Unavailable => {
                return uncertain(
                    &site.name,
                    url,
                    Instant::now(),
                    UncertainReason::GeoUnavailable,
                );
            }
        };

        let host = host_of(&url);

        if let Some(robots) = &self.robots {
            // URLs that do not split into origin + path skip the robots check.
            if let Some((origin, path)) = origin_and_path(&url) {
                if !robots.allowed(&origin, &path).await {
                    tracing::debug!(%url, "skipped by robots.txt");
                    return uncertain(
                        &site.name,
                        url,
                        Instant::now(),
                        UncertainReason::RobotsDisallowed,
                    );
                }
            }
        }

        if let Some(global) = &self.global_throttle {
            global.wait(GLOBAL_THROTTLE_KEY).await;
        }
        self.throttle.wait(&host).await;
        // The clock starts after throttling, so waiting is not counted in the
        // reported elapsed time.
        let started = Instant::now();
        tracing::debug!(%url, %host, "probing");

        let want_enrich = self.enrich && !site.extract.is_empty();
        let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);

        // Only POST requests carry a body; the username placeholder in the
        // site's body template is substituted here.
        let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
            const USERNAME_PH: &str = "{username}";
            site.request_body
                .as_deref()
                .map(|t| t.replace(USERNAME_PH, username.as_str()))
        } else {
            None
        };

        let req = FetchRequest {
            method: site.request_method,
            url: &url,
            body: body_for_post.as_deref(),
            user_agent: self.pick_user_agent(),
            headers,
            want_body: needs_body,
        };
        let mut primary = match egress.fetch(&req).await {
            Ok(resp) => self.finish(site, url.clone(), started, &resp),
            Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
        };
        primary.transport = Some(crate::escalation::TransportTier::Http);
        self.maybe_escalate(site, &url, headers, primary).await
    }
471
472 async fn maybe_escalate(
477 &self,
478 site: &Site,
479 url: &str,
480 headers: &BTreeMap<String, String>,
481 primary: CheckOutcome,
482 ) -> CheckOutcome {
483 if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
484 return primary;
485 }
486 let Some(reason) = &primary.reason else {
487 return primary;
488 };
489 if !crate::escalation::should_escalate(reason) {
490 return primary;
491 }
492 let Some(backend) = &self.browser else {
493 return primary;
494 };
495 if !self.escalation_budget.try_consume() {
496 tracing::debug!(site = %site.name, "escalation budget exhausted");
497 return primary;
498 }
499
500 tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
501 let started = Instant::now();
502 let req = FetchRequest {
503 method: site.request_method,
504 url,
505 body: None,
506 user_agent: None,
507 headers,
508 want_body: true,
509 };
510 let fetcher = BrowserFetcher::new(Arc::clone(backend));
511 let mut escalated = match fetcher.fetch(&req).await {
512 Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
513 Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
514 };
515 escalated.transport = Some(crate::escalation::TransportTier::Browser);
516 escalated.escalations = 1;
517 escalated
518 }
519
520 fn finish(
524 &self,
525 site: &Site,
526 url: String,
527 started: Instant,
528 resp: &crate::transport::FetchResponse,
529 ) -> CheckOutcome {
530 let probe = Probe {
531 status: resp.status,
532 final_url: &resp.final_url,
533 body: &resp.body,
534 };
535 let votes: Vec<(&Signal, SignalVerdict)> = site
536 .signals
537 .iter()
538 .map(|s| (s, s.evaluate(&probe)))
539 .collect();
540 let kind = aggregate(votes.iter().map(|(_, v)| *v));
541 let mut result = outcome(&site.name, url, started, kind);
542 let winning = match kind {
544 MatchKind::Found => Some(SignalVerdict::Found),
545 MatchKind::NotFound => Some(SignalVerdict::NotFound),
546 MatchKind::Uncertain => None,
547 };
548 if let Some(want) = winning {
549 result.evidence = votes
550 .iter()
551 .filter(|(_, v)| *v == want)
552 .map(|(s, _)| s.describe_match(&probe))
553 .collect();
554 }
555 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
556 result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
557 }
558 result
559 }
560}
561
/// Unevaluated response returned by [`Client::fetch`] and
/// [`Client::fetch_for_doctor`].
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code of the final response.
    pub status: u16,
    /// URL the response was ultimately served from (after any redirects that
    /// were followed).
    pub final_url: String,
    /// Response body as text; empty when the body could not be read on the
    /// raw HTTP path.
    pub body: String,
}
572
/// Configuration for a [`Client`]; consumed by [`ClientBuilder::build`].
///
/// Every setter takes and returns the builder by value so calls can be chained.
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
// Independent on/off switches, not an encoded state machine.
#[allow(clippy::struct_excessive_bools)]
pub struct ClientBuilder {
    /// Whole-request timeout given to every reqwest client that is built.
    timeout: Duration,
    /// Connect timeout given to every reqwest client that is built.
    connect_timeout: Duration,
    /// Client-level User-Agent given to every reqwest client that is built.
    user_agent: String,
    /// Whether redirects are followed (up to `redirect_limit`) or not at all.
    follow_redirects: bool,
    /// Maximum redirects followed when `follow_redirects` is set.
    redirect_limit: usize,
    /// Interval handed to the per-host throttle.
    min_request_interval: Duration,
    /// Optional global requests-per-second cap; enables the global throttle.
    max_rps: Option<NonZeroU32>,
    /// Retry policy copied into the client.
    retry: RetryPolicy,
    /// Optional proxy URL for the default client.
    proxy: Option<String>,
    /// Pool of User-Agent strings to rotate through per request.
    user_agents: Vec<String>,
    /// Whether enrichment extraction is enabled.
    enrich: bool,
    /// Whether a robots.txt cache is created and consulted.
    respect_robots: bool,
    /// Optional browser backend.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Capacity of the browser budget.
    browser_budget: usize,
    /// Egress specs; one proxied reqwest client is built per entry.
    egress: Vec<EgressSpec>,
    /// Session store moved into the client.
    sessions: SessionStore,
    /// Capacity of the escalation budget.
    escalation_budget: usize,
    /// Whether uncertain HTTP outcomes may be escalated to the browser.
    escalation_enabled: bool,
}
601
602impl Default for ClientBuilder {
603 fn default() -> Self {
604 Self {
605 timeout: DEFAULT_TIMEOUT,
606 connect_timeout: DEFAULT_CONNECT_TIMEOUT,
607 user_agent: default_user_agent(),
608 follow_redirects: true,
609 redirect_limit: DEFAULT_REDIRECT_LIMIT,
610 min_request_interval: DEFAULT_PER_HOST_INTERVAL,
611 max_rps: None,
612 retry: RetryPolicy::default(),
613 proxy: None,
614 user_agents: Vec::new(),
615 enrich: false,
616 respect_robots: false,
617 browser: None,
618 browser_budget: DEFAULT_BROWSER_BUDGET,
619 egress: Vec::new(),
620 sessions: SessionStore::new(),
621 escalation_budget: DEFAULT_ESCALATION_BUDGET,
622 escalation_enabled: true,
623 }
624 }
625}
626
627impl ClientBuilder {
628 pub fn timeout(mut self, timeout: Duration) -> Self {
630 self.timeout = timeout;
631 self
632 }
633
634 pub fn connect_timeout(mut self, timeout: Duration) -> Self {
636 self.connect_timeout = timeout;
637 self
638 }
639
640 pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
642 self.user_agent = user_agent.into();
643 self
644 }
645
646 pub fn follow_redirects(mut self, follow: bool) -> Self {
649 self.follow_redirects = follow;
650 self
651 }
652
653 pub fn min_request_interval(mut self, interval: Duration) -> Self {
659 self.min_request_interval = interval;
660 self
661 }
662
663 pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
668 self.max_rps = Some(rps);
669 self
670 }
671
672 pub fn max_retries(mut self, n: u32) -> Self {
675 self.retry.max_retries = n;
676 self
677 }
678
679 pub fn base_backoff_delay(mut self, d: Duration) -> Self {
682 self.retry.base_delay = d;
683 self
684 }
685
686 pub fn max_backoff_delay(mut self, d: Duration) -> Self {
688 self.retry.max_delay = d;
689 self
690 }
691
692 pub fn proxy(mut self, url: impl Into<String>) -> Self {
695 self.proxy = Some(url.into());
696 self
697 }
698
699 pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
703 self.user_agents = agents;
704 self
705 }
706
707 pub fn enrich(mut self, enrich: bool) -> Self {
710 self.enrich = enrich;
711 self
712 }
713
714 pub fn respect_robots(mut self, respect: bool) -> Self {
718 self.respect_robots = respect;
719 self
720 }
721
722 pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
726 self.browser = Some(backend);
727 self
728 }
729
    /// Sets the capacity handed to `BrowserBudget::new` when the client is
    /// built. One unit is consumed per probe routed straight to the browser.
    pub const fn browser_budget(mut self, cap: usize) -> Self {
        self.browser_budget = cap;
        self
    }
738
    /// Sets the capacity handed to `EscalationBudget::new` when the client is
    /// built. One unit is consumed per HTTP-to-browser escalation.
    pub const fn escalation_budget(mut self, cap: usize) -> Self {
        self.escalation_budget = cap;
        self
    }
749
    /// Turns off escalation: uncertain HTTP outcomes are returned as-is rather
    /// than retried through the browser.
    pub const fn disable_escalation(mut self) -> Self {
        self.escalation_enabled = false;
        self
    }
758
759 pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
764 self.egress = egress;
765 self
766 }
767
768 pub fn sessions(mut self, sessions: SessionStore) -> Self {
774 self.sessions = sessions;
775 self
776 }
777
778 pub fn build(self) -> Result<Client> {
780 let inner = build_reqwest(
781 &self.user_agent,
782 self.timeout,
783 self.connect_timeout,
784 self.follow_redirects,
785 self.redirect_limit,
786 self.proxy.as_deref(),
787 )?;
788
789 let mut egress_entries = Vec::with_capacity(self.egress.len());
793 for spec in &self.egress {
794 let client = build_reqwest(
795 &self.user_agent,
796 self.timeout,
797 self.connect_timeout,
798 self.follow_redirects,
799 self.redirect_limit,
800 Some(&spec.url),
801 )?;
802 egress_entries.push((
803 spec.name.clone(),
804 spec.country.clone(),
805 spec.kind,
806 Arc::new(HttpFetcher::new(client)),
807 ));
808 }
809
810 let global_throttle = self.max_rps.map(|rps| {
811 let interval = Duration::from_secs(1) / rps.get();
813 HostThrottle::new(interval)
814 });
815 let robots = self
816 .respect_robots
817 .then(|| RobotsCache::new(inner.clone(), "adler"));
818 #[cfg(feature = "impersonate")]
822 let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
823 Ok(Client {
824 http: Arc::new(HttpFetcher::new(inner)),
825 egress: Arc::new(EgressPool::new(egress_entries)),
826 sessions: Arc::new(self.sessions),
827 throttle: HostThrottle::new(self.min_request_interval),
828 global_throttle,
829 retry: self.retry,
830 user_agents: Arc::from(self.user_agents),
831 enrich: self.enrich,
832 robots,
833 browser: self.browser,
834 browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
835 escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
836 self.escalation_budget,
837 )),
838 escalation_enabled: self.escalation_enabled,
839 #[cfg(feature = "impersonate")]
840 impersonate,
841 })
842 }
843}
844
845fn build_reqwest(
849 user_agent: &str,
850 timeout: Duration,
851 connect_timeout: Duration,
852 follow_redirects: bool,
853 redirect_limit: usize,
854 proxy: Option<&str>,
855) -> Result<reqwest::Client> {
856 let redirect_policy = if follow_redirects {
857 redirect::Policy::limited(redirect_limit)
858 } else {
859 redirect::Policy::none()
860 };
861 let mut builder = reqwest::Client::builder()
862 .user_agent(user_agent.to_owned())
863 .timeout(timeout)
864 .connect_timeout(connect_timeout)
865 .redirect(redirect_policy);
866 if let Some(proxy_url) = proxy {
867 const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
871 if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
872 return Err(Error::HttpSetup {
873 message: format!(
874 "invalid proxy {proxy_url:?}: must start with one of {}",
875 SCHEMES.join(", ")
876 ),
877 });
878 }
879 let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
880 message: format!("invalid proxy {proxy_url:?}: {e}"),
881 })?;
882 builder = builder.proxy(proxy);
883 }
884 builder.build().map_err(|e| Error::HttpSetup {
885 message: e.to_string(),
886 })
887}
888
/// Browser budget capacity a fresh [`ClientBuilder`] starts with.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;

/// Escalation budget capacity a fresh [`ClientBuilder`] starts with.
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
907
908impl fmt::Debug for Client {
909 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
910 f.debug_struct("Client")
911 .field("throttle", &self.throttle)
912 .field("global_throttle", &self.global_throttle)
913 .field("retry", &self.retry)
914 .field("user_agents", &self.user_agents)
915 .field("enrich", &self.enrich)
916 .field("robots", &self.robots.is_some())
917 .field("browser", &self.browser.is_some())
918 .field("browser_budget", &self.browser_budget)
919 .field("escalation_budget", &self.escalation_budget)
920 .field("escalation_enabled", &self.escalation_enabled)
921 .finish_non_exhaustive()
922 }
923}
924
925impl fmt::Debug for ClientBuilder {
926 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
927 f.debug_struct("ClientBuilder")
928 .field("timeout", &self.timeout)
929 .field("connect_timeout", &self.connect_timeout)
930 .field("user_agent", &self.user_agent)
931 .field("follow_redirects", &self.follow_redirects)
932 .field("redirect_limit", &self.redirect_limit)
933 .field("min_request_interval", &self.min_request_interval)
934 .field("max_rps", &self.max_rps)
935 .field("retry", &self.retry)
936 .field("proxy", &self.proxy)
937 .field("user_agents", &self.user_agents)
938 .field("enrich", &self.enrich)
939 .field("respect_robots", &self.respect_robots)
940 .field("browser", &self.browser.is_some())
941 .field("browser_budget", &self.browser_budget)
942 .field("egress", &self.egress)
943 .field("sessions", &self.sessions)
944 .field("escalation_budget", &self.escalation_budget)
945 .field("escalation_enabled", &self.escalation_enabled)
946 .finish()
947 }
948}
949
/// Site tag (compared ASCII case-insensitively) that routes a site to the
/// browser backend when one is configured.
const BOT_PROTECTED_TAG: &str = "bot-protected";
951
952fn default_user_agent() -> String {
953 format!("adler/{}", env!("CARGO_PKG_VERSION"))
954}
955
956fn host_of(url: &str) -> String {
957 reqwest::Url::parse(url)
958 .ok()
959 .and_then(|u| u.host_str().map(str::to_owned))
960 .unwrap_or_else(|| "unknown".into())
961}
962
963fn origin_and_path(url: &str) -> Option<(String, String)> {
966 let parsed = reqwest::Url::parse(url).ok()?;
967 let host = parsed.host_str()?;
968 let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
969 let origin = format!("{}://{host}{port}", parsed.scheme());
970 let path = parsed.query().map_or_else(
971 || parsed.path().to_owned(),
972 |q| format!("{}?{q}", parsed.path()),
973 );
974 Some((origin, path))
975}
976
977fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
978 CheckOutcome {
979 site: site.to_owned(),
980 url,
981 kind,
982 reason: None,
983 elapsed_ms: elapsed_ms(started),
984 enrichment: std::collections::BTreeMap::new(),
985 evidence: Vec::new(),
986 transport: None,
987 escalations: 0,
988 }
989}
990
991fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
992 CheckOutcome {
993 site: site.to_owned(),
994 url,
995 kind: MatchKind::Uncertain,
996 reason: Some(reason),
997 elapsed_ms: elapsed_ms(started),
998 enrichment: std::collections::BTreeMap::new(),
999 evidence: Vec::new(),
1000 transport: None,
1001 escalations: 0,
1002 }
1003}
1004
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`.
fn elapsed_ms(started: Instant) -> u64 {
    match u64::try_from(started.elapsed().as_millis()) {
        Ok(ms) => ms,
        Err(_) => u64::MAX,
    }
}
1008
1009#[cfg(test)]
1010mod tests {
1011 use super::*;
1012 use crate::browser::RenderedPage;
1013 use crate::site::{Signal, UrlTemplate};
1014 use wiremock::matchers::{any, method, path};
1015 use wiremock::{Mock, MockServer, ResponseTemplate};
1016
    /// Client for tests: 2-second timeout, no per-host delay, and no retries,
    /// so each `check` results in at most one probe.
    fn build_client() -> Client {
        Client::builder()
            .timeout(Duration::from_secs(2))
            .min_request_interval(Duration::ZERO)
            .max_retries(0)
            .build()
            .expect("client builds")
    }
1029
    /// Minimal GET site named "Mock" whose URL template is
    /// `<server>/{username}` and whose only non-default field is `signals`.
    fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
        Site {
            name: "Mock".into(),
            url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
            signals,
            known_present: None,
            known_absent: None,
            extract: Vec::new(),
            tags: Vec::new(),
            request_headers: std::collections::BTreeMap::new(),
            regex_check: None,
            engine: None,
            strip_bad_char: None,
            request_method: crate::site::HttpMethod::Get,
            request_body: None,
            protection: Vec::new(),
            disabled: false,
            disabled_reason: None,
            source: None,
            popularity: None,
            access: crate::AccessPolicy::default(),
        }
    }
1053
    /// The username every test probes for.
    fn user() -> Username {
        Username::new("alice").unwrap()
    }
1057
    /// A username that fails the site's `regex_check` yields
    /// `UsernameNotAllowed` and the server never sees a request.
    #[tokio::test]
    async fn regex_check_short_circuits_before_any_request() {
        let server = MockServer::start().await;
        Mock::given(any())
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        // "alice" is five letters, so an eight-letter minimum rejects it.
        site.regex_check = Some("^[A-Za-z]{8,}$".into());
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Uncertain);
        assert!(
            matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
            "expected UsernameNotAllowed, got {:?}",
            outcome.reason,
        );
        let recvd = server.received_requests().await.unwrap_or_default();
        assert_eq!(
            recvd.len(),
            0,
            "regex_check mismatch must skip the HTTP request entirely"
        );
    }
1087
    /// A site restricted to a country, probed by a client with no egress
    /// configured, yields `GeoUnavailable` without sending a request.
    #[tokio::test]
    async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
        let server = MockServer::start().await;
        Mock::given(any())
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        site.access = crate::access::AccessPolicy {
            geo: vec![crate::access::CountryCode::new("pl").unwrap()],
            ..crate::access::AccessPolicy::default()
        };
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Uncertain);
        assert!(
            matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
            "expected GeoUnavailable, got {:?}",
            outcome.reason,
        );
        let recvd = server.received_requests().await.unwrap_or_default();
        assert_eq!(
            recvd.len(),
            0,
            "geo-unavailable must skip the HTTP request entirely"
        );
    }
1120
    /// Headers of the session a site names are sent with the probe: the mock
    /// only answers 200 when the session cookie is present.
    #[tokio::test]
    async fn session_headers_are_sent_on_probe() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(wiremock::matchers::header("cookie", "sessionid=real"))
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let mut headers = std::collections::BTreeMap::new();
        headers.insert("Cookie".to_string(), "sessionid=real".to_string());
        let mut store = SessionStore::new();
        store.insert("acct", crate::access::Session::from_headers(headers));
        let client = Client::builder()
            .timeout(Duration::from_secs(2))
            .min_request_interval(Duration::ZERO)
            .max_retries(0)
            .sessions(store)
            .build()
            .expect("client builds");
        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        site.access.session = Some("acct".to_string());
        let outcome = client.check(&site, &user()).await;
        assert_eq!(
            outcome.kind,
            MatchKind::Found,
            "session cookie should unlock the 200 (got {:?})",
            outcome.reason,
        );
    }
1152
    /// A site naming a session the client does not have yields
    /// `SessionRequired` and is not probed unauthenticated.
    #[tokio::test]
    async fn missing_named_session_is_session_required() {
        let server = MockServer::start().await;
        Mock::given(any())
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        site.access.session = Some("not-configured".to_string());
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Uncertain);
        assert!(
            matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
            "expected SessionRequired, got {:?}",
            outcome.reason,
        );
        let recvd = server.received_requests().await.unwrap_or_default();
        assert_eq!(
            recvd.len(),
            0,
            "a missing session must skip the request, not probe unauthenticated"
        );
    }
1177
    /// With the `impersonate` feature, a site whose only protection is
    /// `TlsFingerprint` is fetched by the impersonating fetcher: exactly one
    /// request arrives and it carries a Chrome-shaped User-Agent.
    #[cfg(feature = "impersonate")]
    #[tokio::test]
    async fn impersonate_routes_pure_tls_fingerprint_site() {
        let server = MockServer::start().await;
        Mock::given(any())
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let client = Client::builder()
            .timeout(Duration::from_secs(2))
            .min_request_interval(Duration::ZERO)
            .max_retries(0)
            .build()
            .expect("client builds with impersonate");
        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];
        let outcome = client.check(&site, &user()).await;
        assert_eq!(
            outcome.kind,
            MatchKind::Found,
            "expected Found (reason {:?})",
            outcome.reason,
        );
        let recvd = server.received_requests().await.expect("received requests");
        assert_eq!(recvd.len(), 1, "expected exactly one request");
        let ua = recvd[0]
            .headers
            .get("user-agent")
            .and_then(|v| v.to_str().ok())
            .unwrap_or("");
        assert!(
            ua.contains("Chrome/"),
            "expected Chrome-shaped UA from wreq, got {ua:?}"
        );
    }
1218
    /// A username that satisfies `regex_check` is probed normally.
    #[tokio::test]
    async fn regex_check_pass_proceeds_to_probe() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        site.regex_check = Some("^[a-z]{3,}$".into());
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Found);
    }
1233
    /// A matching `StatusFound` signal yields `Found`, with the probed URL, no
    /// reason, and the signal's match description as evidence.
    #[tokio::test]
    async fn status_signal_reports_found_on_match() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(ResponseTemplate::new(200))
            .mount(&server)
            .await;
        let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Found);
        assert!(outcome.url.ends_with("/alice"));
        assert!(outcome.reason.is_none());
        assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
    }
1249
    /// With both status signals configured, a 404 yields `NotFound` and the
    /// evidence lists only the signal that voted for it.
    #[tokio::test]
    async fn status_signal_pair_reports_not_found_on_404() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;
        let site = site_with(
            &server,
            vec![
                Signal::StatusFound { codes: vec![200] },
                Signal::StatusNotFound { codes: vec![404] },
            ],
        );
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::NotFound);
        assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
    }
1270
    /// A `BodyAbsent` marker found in the body yields `NotFound`, even though
    /// the status is 200.
    #[tokio::test]
    async fn body_absent_signal_detects_missing_account() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
            .mount(&server)
            .await;
        let site = site_with(
            &server,
            vec![Signal::BodyAbsent {
                text: "Profile not found".into(),
            }],
        );
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::NotFound);
    }
1288
    /// A lone `BodyAbsent` signal whose marker is not in the body yields
    /// `Uncertain`, not `Found`.
    #[tokio::test]
    async fn body_absent_alone_yields_uncertain_when_marker_missing() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
            .mount(&server)
            .await;
        let site = site_with(
            &server,
            vec![Signal::BodyAbsent {
                text: "Profile not found".into(),
            }],
        );
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Uncertain);
    }
1308
    /// A matching `BodyPresent` marker together with a non-matching
    /// `BodyAbsent` marker resolves to `Found`.
    #[tokio::test]
    async fn body_present_plus_absent_resolve_to_found() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(
                ResponseTemplate::new(200)
                    .set_body_string(r#"<div class="profile-card">alice</div>"#),
            )
            .mount(&server)
            .await;
        let site = site_with(
            &server,
            vec![
                Signal::BodyPresent {
                    text: "profile-card".into(),
                },
                Signal::BodyAbsent {
                    text: "Profile not found".into(),
                },
            ],
        );
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::Found);
    }
1334
    /// A redirect that ends on a URL containing the `RedirectAbsent` fragment
    /// yields `NotFound`; the client follows the 302 to the login page.
    #[tokio::test]
    async fn redirect_absent_signal_detects_missing_account() {
        let server = MockServer::start().await;
        Mock::given(any())
            .and(path("/alice"))
            .respond_with(
                ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
            )
            .mount(&server)
            .await;
        Mock::given(any())
            .and(path("/login"))
            .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
            .mount(&server)
            .await;
        let site = site_with(
            &server,
            vec![Signal::RedirectAbsent {
                fragment: "/login".into(),
            }],
        );
        let outcome = build_client().check(&site, &user()).await;
        assert_eq!(outcome.kind, MatchKind::NotFound);
    }
1359
1360 #[tokio::test]
1361 async fn negative_signal_wins_over_positive() {
1362 let server = MockServer::start().await;
1367 Mock::given(any())
1368 .and(path("/alice"))
1369 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1370 .mount(&server)
1371 .await;
1372 let site = site_with(
1373 &server,
1374 vec![
1375 Signal::StatusFound { codes: vec![200] },
1376 Signal::BodyAbsent {
1377 text: "Profile not found".into(),
1378 },
1379 ],
1380 );
1381 let outcome = build_client().check(&site, &user()).await;
1382 assert_eq!(outcome.kind, MatchKind::NotFound);
1383 }
1384
1385 #[tokio::test]
1386 async fn network_failure_yields_uncertain() {
1387 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1388 let port = listener.local_addr().unwrap().port();
1389 drop(listener);
1390
1391 let site = Site {
1392 name: "Dead".into(),
1393 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1394 signals: vec![Signal::StatusFound { codes: vec![200] }],
1395 known_present: None,
1396 known_absent: None,
1397 extract: Vec::new(),
1398 tags: Vec::new(),
1399 request_headers: std::collections::BTreeMap::new(),
1400 regex_check: None,
1401 engine: None,
1402 strip_bad_char: None,
1403 request_method: crate::site::HttpMethod::Get,
1404 request_body: None,
1405 protection: Vec::new(),
1406 disabled: false,
1407 disabled_reason: None,
1408 source: None,
1409 popularity: None,
1410 access: crate::AccessPolicy::default(),
1411 };
1412 let client = Client::builder()
1413 .timeout(Duration::from_millis(500))
1414 .connect_timeout(Duration::from_millis(500))
1415 .max_retries(0)
1416 .build()
1417 .unwrap();
1418 let outcome = client.check(&site, &user()).await;
1419 assert_eq!(outcome.kind, MatchKind::Uncertain);
1420 assert!(outcome.reason.is_some());
1421 }
1422
1423 #[tokio::test]
1424 async fn throttle_spaces_consecutive_calls_to_same_host() {
1425 let server = MockServer::start().await;
1426 Mock::given(any())
1427 .and(path("/alice"))
1428 .respond_with(ResponseTemplate::new(200))
1429 .mount(&server)
1430 .await;
1431 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1432 let client = Client::builder()
1437 .timeout(Duration::from_secs(2))
1438 .min_request_interval(Duration::from_millis(300))
1439 .build()
1440 .unwrap();
1441
1442 client.check(&site, &user()).await;
1443 let started = Instant::now();
1444 client.check(&site, &user()).await;
1445 let elapsed = started.elapsed();
1446 assert!(
1447 elapsed >= Duration::from_millis(200),
1448 "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1449 );
1450 }
1451
1452 #[tokio::test]
1453 async fn builder_overrides_user_agent() {
1454 let server = MockServer::start().await;
1455 Mock::given(any())
1456 .and(path("/alice"))
1457 .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1458 .respond_with(ResponseTemplate::new(200))
1459 .mount(&server)
1460 .await;
1461 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1462 let client = Client::builder()
1463 .user_agent("adler-test/1.0")
1464 .build()
1465 .unwrap();
1466 let outcome = client.check(&site, &user()).await;
1467 assert_eq!(outcome.kind, MatchKind::Found);
1468 }
1469
1470 #[tokio::test]
1471 async fn rate_limit_429_yields_uncertain_with_note() {
1472 let server = MockServer::start().await;
1473 Mock::given(any())
1474 .and(path("/alice"))
1475 .respond_with(ResponseTemplate::new(429))
1476 .mount(&server)
1477 .await;
1478 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1479 let outcome = build_client().check(&site, &user()).await;
1480 assert_eq!(outcome.kind, MatchKind::Uncertain);
1481 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1482 }
1483
1484 #[tokio::test]
1485 async fn cloudflare_server_header_yields_uncertain() {
1486 let server = MockServer::start().await;
1487 Mock::given(any())
1488 .and(path("/alice"))
1489 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1490 .mount(&server)
1491 .await;
1492 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1493 let outcome = build_client().check(&site, &user()).await;
1494 assert_eq!(outcome.kind, MatchKind::Uncertain);
1495 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1496 }
1497
1498 #[tokio::test]
1499 async fn cloudflare_interstitial_in_body_yields_uncertain() {
1500 let server = MockServer::start().await;
1503 Mock::given(any())
1504 .and(path("/alice"))
1505 .respond_with(
1506 ResponseTemplate::new(200)
1507 .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1508 )
1509 .mount(&server)
1510 .await;
1511 let site = site_with(
1512 &server,
1513 vec![Signal::BodyAbsent {
1514 text: "Profile not found".into(),
1515 }],
1516 );
1517 let outcome = build_client().check(&site, &user()).await;
1518 assert_eq!(outcome.kind, MatchKind::Uncertain);
1519 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1520 }
1521
1522 #[tokio::test]
1523 async fn ban_detection_does_not_fire_on_legitimate_403() {
1524 let server = MockServer::start().await;
1525 Mock::given(any())
1526 .and(path("/alice"))
1527 .respond_with(ResponseTemplate::new(403))
1528 .mount(&server)
1529 .await;
1530 let site = site_with(
1531 &server,
1532 vec![
1533 Signal::StatusFound { codes: vec![200] },
1534 Signal::StatusNotFound { codes: vec![403] },
1535 ],
1536 );
1537 let outcome = build_client().check(&site, &user()).await;
1538 assert_eq!(outcome.kind, MatchKind::NotFound);
1540 assert!(outcome.reason.is_none());
1541 }
1542
1543 #[tokio::test]
1544 async fn retry_recovers_after_transient_429() {
1545 let server = MockServer::start().await;
1546 Mock::given(any())
1548 .and(path("/alice"))
1549 .respond_with(ResponseTemplate::new(429))
1550 .up_to_n_times(1)
1551 .mount(&server)
1552 .await;
1553 Mock::given(any())
1554 .and(path("/alice"))
1555 .respond_with(ResponseTemplate::new(200))
1556 .mount(&server)
1557 .await;
1558 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1559 let client = Client::builder()
1560 .timeout(Duration::from_secs(2))
1561 .min_request_interval(Duration::ZERO)
1562 .max_retries(2)
1563 .base_backoff_delay(Duration::from_millis(20))
1564 .max_backoff_delay(Duration::from_millis(100))
1565 .build()
1566 .unwrap();
1567 let outcome = client.check(&site, &user()).await;
1568 assert_eq!(outcome.kind, MatchKind::Found);
1569 assert!(outcome.reason.is_none());
1570 }
1571
1572 #[tokio::test]
1573 async fn retry_exhausts_and_returns_uncertain() {
1574 let server = MockServer::start().await;
1575 Mock::given(any())
1576 .and(path("/alice"))
1577 .respond_with(ResponseTemplate::new(429))
1578 .mount(&server)
1579 .await;
1580 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1581 let client = Client::builder()
1582 .timeout(Duration::from_secs(2))
1583 .min_request_interval(Duration::ZERO)
1584 .max_retries(2)
1585 .base_backoff_delay(Duration::from_millis(10))
1586 .max_backoff_delay(Duration::from_millis(50))
1587 .build()
1588 .unwrap();
1589 let outcome = client.check(&site, &user()).await;
1590 assert_eq!(outcome.kind, MatchKind::Uncertain);
1591 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1592 }
1593
1594 #[tokio::test]
1595 async fn retry_does_not_fire_on_network_error() {
1596 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1600 let port = listener.local_addr().unwrap().port();
1601 drop(listener);
1602 let site = Site {
1603 name: "Dead".into(),
1604 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1605 signals: vec![Signal::StatusFound { codes: vec![200] }],
1606 known_present: None,
1607 known_absent: None,
1608 extract: Vec::new(),
1609 tags: Vec::new(),
1610 request_headers: std::collections::BTreeMap::new(),
1611 regex_check: None,
1612 engine: None,
1613 strip_bad_char: None,
1614 request_method: crate::site::HttpMethod::Get,
1615 request_body: None,
1616 protection: Vec::new(),
1617 disabled: false,
1618 disabled_reason: None,
1619 source: None,
1620 popularity: None,
1621 access: crate::AccessPolicy::default(),
1622 };
1623 let client = Client::builder()
1624 .timeout(Duration::from_millis(500))
1625 .connect_timeout(Duration::from_millis(500))
1626 .min_request_interval(Duration::ZERO)
1627 .max_retries(3)
1628 .base_backoff_delay(Duration::from_secs(60))
1629 .build()
1630 .unwrap();
1631 let started = Instant::now();
1632 let outcome = client.check(&site, &user()).await;
1633 assert!(started.elapsed() < Duration::from_secs(5));
1636 assert_eq!(outcome.kind, MatchKind::Uncertain);
1637 assert!(
1638 matches!(outcome.reason, Some(UncertainReason::Network(_))),
1639 "got {:?}",
1640 outcome.reason,
1641 );
1642 }
1643
1644 #[tokio::test]
1645 async fn rotates_user_agent_per_request() {
1646 let server = MockServer::start().await;
1650 Mock::given(any())
1651 .and(path("/alice"))
1652 .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1653 .respond_with(ResponseTemplate::new(200))
1654 .mount(&server)
1655 .await;
1656 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1657 let client = Client::builder()
1658 .min_request_interval(Duration::ZERO)
1659 .max_retries(0)
1660 .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1661 .build()
1662 .unwrap();
1663 let outcome = client.check(&site, &user()).await;
1664 assert_eq!(outcome.kind, MatchKind::Found);
1665 }
1666
/// Building a client with an unparseable proxy string fails with
/// `Error::HttpSetup`.
#[test]
fn invalid_proxy_url_fails_build() {
    let attempt = Client::builder().proxy("not a url").build();
    let failure = attempt.unwrap_err();
    assert!(matches!(failure, Error::HttpSetup { .. }));
}
1672
/// A proxy string without a scheme is rejected with an `HttpSetup` error
/// whose message mentions the required prefix.
#[test]
fn schemeless_proxy_is_rejected_up_front() {
    match Client::builder().proxy("not-a-url").build().unwrap_err() {
        Error::HttpSetup { message } => {
            assert!(message.contains("must start with"), "{message}");
        }
        err => panic!("expected HttpSetup, got {err:?}"),
    }
}
1682
/// A `socks5://` proxy URL is accepted by the builder.
#[test]
fn socks5_proxy_scheme_is_accepted() {
    let built = Client::builder().proxy("socks5://127.0.0.1:9050").build();
    assert!(built.is_ok());
}
1693
1694 #[tokio::test]
1695 async fn global_rps_cap_spaces_requests_across_hosts() {
1696 let server = MockServer::start().await;
1699 Mock::given(any())
1700 .respond_with(ResponseTemplate::new(200))
1701 .mount(&server)
1702 .await;
1703 let site_a = Site {
1704 name: "A".into(),
1705 url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1706 signals: vec![Signal::StatusFound { codes: vec![200] }],
1707 known_present: None,
1708 known_absent: None,
1709 extract: Vec::new(),
1710 tags: Vec::new(),
1711 request_headers: std::collections::BTreeMap::new(),
1712 regex_check: None,
1713 engine: None,
1714 strip_bad_char: None,
1715 request_method: crate::site::HttpMethod::Get,
1716 request_body: None,
1717 protection: Vec::new(),
1718 disabled: false,
1719 disabled_reason: None,
1720 source: None,
1721 popularity: None,
1722 access: crate::AccessPolicy::default(),
1723 };
1724 let site_b = Site {
1725 name: "B".into(),
1726 url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1727 signals: vec![Signal::StatusFound { codes: vec![200] }],
1728 known_present: None,
1729 known_absent: None,
1730 extract: Vec::new(),
1731 tags: Vec::new(),
1732 request_headers: std::collections::BTreeMap::new(),
1733 regex_check: None,
1734 engine: None,
1735 strip_bad_char: None,
1736 request_method: crate::site::HttpMethod::Get,
1737 request_body: None,
1738 protection: Vec::new(),
1739 disabled: false,
1740 disabled_reason: None,
1741 source: None,
1742 popularity: None,
1743 access: crate::AccessPolicy::default(),
1744 };
1745 let client = Client::builder()
1750 .min_request_interval(Duration::ZERO)
1751 .max_retries(0)
1752 .max_rps(std::num::NonZeroU32::new(2).unwrap())
1753 .build()
1754 .unwrap();
1755 client.check(&site_a, &user()).await;
1758 let started = Instant::now();
1759 client.check(&site_b, &user()).await;
1760 assert!(
1761 started.elapsed() >= Duration::from_millis(350),
1762 "global cap should space cross-host requests, got {:?}",
1763 started.elapsed(),
1764 );
1765 }
1766
1767 #[tokio::test]
1768 async fn respect_robots_skips_disallowed_paths() {
1769 let server = MockServer::start().await;
1770 Mock::given(any())
1771 .and(path("/robots.txt"))
1772 .respond_with(
1773 ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1774 )
1775 .mount(&server)
1776 .await;
1777 Mock::given(any())
1778 .and(path("/no/alice"))
1779 .respond_with(ResponseTemplate::new(200))
1780 .mount(&server)
1781 .await;
1782 Mock::given(any())
1783 .and(path("/yes/alice"))
1784 .respond_with(ResponseTemplate::new(200))
1785 .mount(&server)
1786 .await;
1787 let client = Client::builder()
1788 .min_request_interval(Duration::ZERO)
1789 .max_retries(0)
1790 .respect_robots(true)
1791 .build()
1792 .unwrap();
1793
1794 let disallowed = Site {
1795 name: "No".into(),
1796 url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1797 signals: vec![Signal::StatusFound { codes: vec![200] }],
1798 known_present: None,
1799 known_absent: None,
1800 extract: Vec::new(),
1801 tags: Vec::new(),
1802 request_headers: std::collections::BTreeMap::new(),
1803 regex_check: None,
1804 engine: None,
1805 strip_bad_char: None,
1806 request_method: crate::site::HttpMethod::Get,
1807 request_body: None,
1808 protection: Vec::new(),
1809 disabled: false,
1810 disabled_reason: None,
1811 source: None,
1812 popularity: None,
1813 access: crate::AccessPolicy::default(),
1814 };
1815 let allowed = Site {
1816 name: "Yes".into(),
1817 url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1818 signals: vec![Signal::StatusFound { codes: vec![200] }],
1819 known_present: None,
1820 known_absent: None,
1821 extract: Vec::new(),
1822 tags: Vec::new(),
1823 request_headers: std::collections::BTreeMap::new(),
1824 regex_check: None,
1825 engine: None,
1826 strip_bad_char: None,
1827 request_method: crate::site::HttpMethod::Get,
1828 request_body: None,
1829 protection: Vec::new(),
1830 disabled: false,
1831 disabled_reason: None,
1832 source: None,
1833 popularity: None,
1834 access: crate::AccessPolicy::default(),
1835 };
1836
1837 let no = client.check(&disallowed, &user()).await;
1838 assert_eq!(no.kind, MatchKind::Uncertain);
1839 assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1840
1841 let yes = client.check(&allowed, &user()).await;
1842 assert_eq!(yes.kind, MatchKind::Found);
1843 }
1844
1845 #[tokio::test]
1846 async fn body_read_skipped_when_no_body_signal_needed() {
1847 let server = MockServer::start().await;
1850 Mock::given(any())
1851 .and(path("/alice"))
1852 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1853 .mount(&server)
1854 .await;
1855 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1856 let outcome = build_client().check(&site, &user()).await;
1857 assert_eq!(outcome.kind, MatchKind::Found);
1858 }
1859
/// Test double for [`BrowserBackend`]: its `fetch` implementation in this
/// module returns a clone of `page` and bumps `calls` on every invocation.
#[derive(Debug)]
struct RecordingBackend {
    /// Canned page handed back (cloned) by each `fetch` call.
    page: RenderedPage,
    /// Number of `fetch` invocations seen so far.
    calls: std::sync::atomic::AtomicUsize,
}
1870
1871 impl RecordingBackend {
1872 fn with_page(page: RenderedPage) -> Self {
1873 Self {
1874 page,
1875 calls: std::sync::atomic::AtomicUsize::new(0),
1876 }
1877 }
1878 fn call_count(&self) -> usize {
1879 self.calls.load(std::sync::atomic::Ordering::SeqCst)
1880 }
1881 }
1882
1883 #[async_trait::async_trait]
1884 impl BrowserBackend for RecordingBackend {
1885 async fn fetch(
1886 &self,
1887 _url: &url::Url,
1888 _headers: &std::collections::BTreeMap<String, String>,
1889 _timeout: Duration,
1890 ) -> Result<RenderedPage> {
1891 self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1892 Ok(self.page.clone())
1893 }
1894 }
1895
1896 fn site_bot_protected(server: &MockServer) -> Site {
1897 let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1898 s.tags = vec!["bot-protected".into()];
1899 s
1900 }
1901
1902 #[tokio::test]
1903 async fn browser_routes_bot_protected_sites() {
1904 let server = MockServer::start().await;
1907 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1908 status: 200,
1909 final_url: url::Url::parse("https://example.com/alice").unwrap(),
1910 body: "<html></html>".into(),
1911 elapsed_ms: 42,
1912 }));
1913 let client = Client::builder()
1914 .min_request_interval(Duration::ZERO)
1915 .max_retries(0)
1916 .browser(backend.clone())
1917 .build()
1918 .unwrap();
1919 let outcome = client.check(&site_bot_protected(&server), &user()).await;
1920 assert_eq!(outcome.kind, MatchKind::Found);
1921 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1922 }
1923
1924 #[tokio::test]
1925 async fn non_bot_protected_sites_skip_browser() {
1926 let server = MockServer::start().await;
1927 Mock::given(any())
1928 .and(path("/alice"))
1929 .respond_with(ResponseTemplate::new(200))
1930 .mount(&server)
1931 .await;
1932 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1933 status: 500, final_url: url::Url::parse("https://x/").unwrap(),
1935 body: String::new(),
1936 elapsed_ms: 0,
1937 }));
1938 let client = Client::builder()
1939 .min_request_interval(Duration::ZERO)
1940 .max_retries(0)
1941 .browser(backend.clone())
1942 .build()
1943 .unwrap();
1944 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1946 let outcome = client.check(&site, &user()).await;
1947 assert_eq!(outcome.kind, MatchKind::Found);
1948 assert_eq!(backend.call_count(), 0, "browser must not be touched");
1949 }
1950
1951 #[tokio::test]
1952 async fn browser_budget_exhaust_yields_uncertain() {
1953 let server = MockServer::start().await;
1954 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1955 status: 200,
1956 final_url: url::Url::parse("https://x/").unwrap(),
1957 body: String::new(),
1958 elapsed_ms: 0,
1959 }));
1960 let client = Client::builder()
1961 .min_request_interval(Duration::ZERO)
1962 .max_retries(0)
1963 .browser(backend.clone())
1964 .browser_budget(1)
1965 .build()
1966 .unwrap();
1967 let site = site_bot_protected(&server);
1968 let first = client.check(&site, &user()).await;
1970 assert_eq!(first.kind, MatchKind::Found);
1971 let second = client.check(&site, &user()).await;
1973 assert_eq!(second.kind, MatchKind::Uncertain);
1974 assert!(matches!(
1975 second.reason,
1976 Some(UncertainReason::BrowserBudget)
1977 ));
1978 assert_eq!(
1979 backend.call_count(),
1980 1,
1981 "second call must not invoke backend"
1982 );
1983 }
1984
1985 #[tokio::test]
1986 async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1987 struct FailingBackend;
1988 #[async_trait::async_trait]
1989 impl BrowserBackend for FailingBackend {
1990 async fn fetch(
1991 &self,
1992 _url: &url::Url,
1993 _headers: &std::collections::BTreeMap<String, String>,
1994 _timeout: Duration,
1995 ) -> Result<RenderedPage> {
1996 Err(Error::BrowserSetup {
1997 message: "simulated crash".into(),
1998 })
1999 }
2000 }
2001 impl std::fmt::Debug for FailingBackend {
2002 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2003 f.write_str("FailingBackend")
2004 }
2005 }
2006
2007 let server = MockServer::start().await;
2008 let client = Client::builder()
2009 .min_request_interval(Duration::ZERO)
2010 .max_retries(0)
2011 .browser(Arc::new(FailingBackend))
2012 .build()
2013 .unwrap();
2014 let outcome = client.check(&site_bot_protected(&server), &user()).await;
2015 assert_eq!(outcome.kind, MatchKind::Uncertain);
2016 match outcome.reason {
2017 Some(UncertainReason::BrowserFailed(msg)) => {
2018 assert!(msg.contains("simulated crash"), "got: {msg}");
2019 }
2020 other => panic!("expected BrowserFailed, got {other:?}"),
2021 }
2022 }
2023
2024 #[tokio::test]
2025 async fn status_only_site_uses_head_request() {
2026 let server = MockServer::start().await;
2030 Mock::given(method("HEAD"))
2031 .and(path("/alice"))
2032 .respond_with(ResponseTemplate::new(200))
2033 .mount(&server)
2034 .await;
2035 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2036 let outcome = build_client().check(&site, &user()).await;
2037 assert_eq!(outcome.kind, MatchKind::Found);
2038 let recvd = server.received_requests().await.unwrap_or_default();
2039 assert_eq!(recvd.len(), 1);
2040 assert_eq!(recvd[0].method.as_str(), "HEAD");
2041 }
2042
2043 #[tokio::test]
2044 async fn body_signal_site_uses_get_request() {
2045 let server = MockServer::start().await;
2048 Mock::given(any())
2049 .and(path("/alice"))
2050 .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
2051 .mount(&server)
2052 .await;
2053 let site = site_with(
2054 &server,
2055 vec![Signal::BodyPresent {
2056 text: "hello".into(),
2057 }],
2058 );
2059 let outcome = build_client().check(&site, &user()).await;
2060 assert_eq!(outcome.kind, MatchKind::Found);
2061 let recvd = server.received_requests().await.unwrap_or_default();
2062 assert_eq!(recvd[0].method.as_str(), "GET");
2063 }
2064
2065 #[tokio::test]
2066 async fn protection_field_routes_through_browser_like_bot_protected_tag() {
2067 let server = MockServer::start().await;
2072 Mock::given(any())
2073 .respond_with(ResponseTemplate::new(200))
2074 .mount(&server)
2075 .await;
2076 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2077 site.protection = vec![crate::site::ProtectionKind::Cloudflare];
2078 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2080 status: 200,
2081 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2082 body: String::new(),
2083 elapsed_ms: 0,
2084 }));
2085 let client = Client::builder()
2086 .min_request_interval(Duration::ZERO)
2087 .max_retries(0)
2088 .browser(backend)
2089 .build()
2090 .unwrap();
2091 let outcome = client.check(&site, &user()).await;
2092 assert_eq!(outcome.kind, MatchKind::Found);
2095 let recvd = server.received_requests().await.unwrap_or_default();
2097 assert_eq!(
2098 recvd.len(),
2099 0,
2100 "structured protection must skip the raw HTTP path"
2101 );
2102 }
2103
2104 #[tokio::test]
2105 async fn post_method_sends_body_with_username_substituted() {
2106 let server = MockServer::start().await;
2110 Mock::given(method("POST"))
2111 .and(path("/api"))
2112 .respond_with(ResponseTemplate::new(200))
2113 .mount(&server)
2114 .await;
2115 let site = Site {
2120 name: "ApiPost".into(),
2121 url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
2122 signals: vec![Signal::StatusFound { codes: vec![200] }],
2123 known_present: None,
2124 known_absent: None,
2125 extract: Vec::new(),
2126 tags: Vec::new(),
2127 request_headers: std::collections::BTreeMap::new(),
2128 regex_check: None,
2129 engine: None,
2130 strip_bad_char: None,
2131 request_method: HttpMethod::Post,
2132 request_body: Some(r#"{"name":"{username}"}"#.into()),
2133 protection: Vec::new(),
2134 disabled: false,
2135 disabled_reason: None,
2136 source: None,
2137 popularity: None,
2138 access: crate::AccessPolicy::default(),
2139 };
2140 let outcome = build_client().check(&site, &user()).await;
2141 assert_eq!(outcome.kind, MatchKind::Found);
2142 let recvd = server.received_requests().await.unwrap_or_default();
2143 assert_eq!(recvd.len(), 1);
2144 assert_eq!(recvd[0].method.as_str(), "POST");
2145 let body = String::from_utf8_lossy(&recvd[0].body).to_string();
2146 assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
2147 }
2148
2149 #[tokio::test]
2150 async fn head_405_falls_back_to_get() {
2151 let server = MockServer::start().await;
2154 Mock::given(method("HEAD"))
2155 .and(path("/alice"))
2156 .respond_with(ResponseTemplate::new(405))
2157 .mount(&server)
2158 .await;
2159 Mock::given(any())
2160 .and(path("/alice"))
2161 .respond_with(ResponseTemplate::new(200))
2162 .mount(&server)
2163 .await;
2164 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2165 let outcome = build_client().check(&site, &user()).await;
2166 assert_eq!(outcome.kind, MatchKind::Found);
2167 let recvd = server.received_requests().await.unwrap_or_default();
2168 assert_eq!(recvd.len(), 2);
2169 assert_eq!(recvd[0].method.as_str(), "HEAD");
2170 assert_eq!(recvd[1].method.as_str(), "GET");
2171 }
2172
2173 async fn cloudflare_503_server() -> MockServer {
2182 let server = MockServer::start().await;
2183 Mock::given(any())
2184 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
2185 .mount(&server)
2186 .await;
2187 server
2188 }
2189
2190 #[tokio::test]
2191 async fn http_success_stamps_http_transport_no_escalations() {
2192 let server = MockServer::start().await;
2193 Mock::given(any())
2194 .respond_with(ResponseTemplate::new(200))
2195 .mount(&server)
2196 .await;
2197 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2198 let outcome = build_client().check(&site, &user()).await;
2199 assert_eq!(outcome.kind, MatchKind::Found);
2200 assert_eq!(
2201 outcome.transport,
2202 Some(crate::escalation::TransportTier::Http),
2203 "successful HTTP probe must stamp Http transport"
2204 );
2205 assert_eq!(outcome.escalations, 0, "no escalation on the happy path");
2206 }
2207
2208 #[tokio::test]
2209 async fn escalates_cloudflare_uncertain_to_browser_and_stamps_one() {
2210 let server = cloudflare_503_server().await;
2211 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2213 status: 200,
2214 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2215 body: String::new(),
2216 elapsed_ms: 5,
2217 }));
2218 let client = Client::builder()
2219 .min_request_interval(Duration::ZERO)
2220 .max_retries(0)
2221 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2222 .build()
2223 .unwrap();
2224 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2227 let outcome = client.check(&site, &user()).await;
2228 assert_eq!(
2229 outcome.kind,
2230 MatchKind::Found,
2231 "escalation should flip CF challenge to Found via browser (reason {:?})",
2232 outcome.reason
2233 );
2234 assert_eq!(
2235 outcome.transport,
2236 Some(crate::escalation::TransportTier::Browser),
2237 "escalated outcome must be stamped Browser"
2238 );
2239 assert_eq!(
2240 outcome.escalations, 1,
2241 "exactly one escalation should have fired"
2242 );
2243 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
2244 }
2245
2246 #[tokio::test]
2247 async fn disable_escalation_leaves_cloudflare_uncertain_untouched() {
2248 let server = cloudflare_503_server().await;
2249 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2250 status: 200,
2251 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2252 body: String::new(),
2253 elapsed_ms: 0,
2254 }));
2255 let client = Client::builder()
2256 .min_request_interval(Duration::ZERO)
2257 .max_retries(0)
2258 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2259 .disable_escalation()
2260 .build()
2261 .unwrap();
2262 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2263 let outcome = client.check(&site, &user()).await;
2264 assert_eq!(outcome.kind, MatchKind::Uncertain);
2265 assert!(matches!(
2266 outcome.reason,
2267 Some(UncertainReason::CloudflareChallenge)
2268 ));
2269 assert_eq!(
2270 outcome.transport,
2271 Some(crate::escalation::TransportTier::Http),
2272 "primary transport must still be stamped"
2273 );
2274 assert_eq!(outcome.escalations, 0);
2275 assert_eq!(
2276 backend.call_count(),
2277 0,
2278 "browser must not be touched when --no-escalation"
2279 );
2280 }
2281
2282 #[tokio::test]
2283 async fn escalation_budget_zero_keeps_browser_untouched() {
2284 let server = cloudflare_503_server().await;
2285 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2286 status: 200,
2287 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2288 body: String::new(),
2289 elapsed_ms: 0,
2290 }));
2291 let client = Client::builder()
2292 .min_request_interval(Duration::ZERO)
2293 .max_retries(0)
2294 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2295 .escalation_budget(0)
2296 .build()
2297 .unwrap();
2298 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2299 let outcome = client.check(&site, &user()).await;
2300 assert_eq!(outcome.kind, MatchKind::Uncertain);
2301 assert!(matches!(
2302 outcome.reason,
2303 Some(UncertainReason::CloudflareChallenge)
2304 ));
2305 assert_eq!(outcome.escalations, 0);
2306 assert_eq!(
2307 backend.call_count(),
2308 0,
2309 "zero budget must deny every escalation"
2310 );
2311 }
2312
2313 #[tokio::test]
2314 async fn escalation_consumes_budget_then_stops() {
2315 let server = cloudflare_503_server().await;
2316 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2317 status: 200,
2318 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2319 body: String::new(),
2320 elapsed_ms: 0,
2321 }));
2322 let client = Client::builder()
2323 .min_request_interval(Duration::ZERO)
2324 .max_retries(0)
2325 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2326 .escalation_budget(1)
2327 .build()
2328 .unwrap();
2329 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2330 let first = client.check(&site, &user()).await;
2332 assert_eq!(first.kind, MatchKind::Found);
2333 assert_eq!(first.escalations, 1);
2334 let second = client.check(&site, &user()).await;
2336 assert_eq!(second.kind, MatchKind::Uncertain);
2337 assert!(matches!(
2338 second.reason,
2339 Some(UncertainReason::CloudflareChallenge)
2340 ));
2341 assert_eq!(second.escalations, 0);
2342 assert_eq!(backend.call_count(), 1, "browser called exactly once total");
2343 }
2344}