1use std::borrow::Cow;
11use std::collections::BTreeMap;
12use std::fmt;
13use std::num::NonZeroU32;
14use std::sync::Arc;
15use std::time::{Duration, Instant};
16
17use reqwest::redirect;
18
19use crate::access::{EgressChoice, EgressPool, EgressSpec, SessionStore};
20use crate::browser::{BrowserBackend, BrowserBudget};
21use crate::check::{CheckOutcome, MatchKind, UncertainReason};
22use crate::error::{Error, Result};
23use crate::retry::{self, RetryPolicy};
24use crate::robots::RobotsCache;
25use crate::site::{HttpMethod, Probe, Signal, SignalVerdict, Site, aggregate};
26use crate::throttle::HostThrottle;
27#[cfg(feature = "impersonate")]
28use crate::transport::ImpersonateFetcher;
29use crate::transport::{
30 BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
31};
32use crate::username::Username;
33
/// Total per-request timeout used by [`ClientBuilder::default`].
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);
/// Connect timeout used by [`ClientBuilder::default`].
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
/// Redirect cap handed to `redirect::Policy::limited` when redirects are followed.
const DEFAULT_REDIRECT_LIMIT: usize = 8;
/// Interval the per-host throttle is constructed with by default.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);
/// Single key every request waits on when the optional global throttle is configured.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
40
/// Probe client: bundles the transports, throttles, budgets and policies that
/// [`Client::check`] uses to test one username against one site.
///
/// Built through [`Client::builder`]. Most state sits behind `Arc`, and
/// [`Client::with_egress_subset`] reuses all of it except the egress pool.
#[derive(Clone)]
pub struct Client {
    /// Default HTTP fetcher: used by `fetch` and whenever egress selection
    /// yields `EgressChoice::Default`.
    http: Arc<HttpFetcher>,
    /// Pool of alternative egress fetchers, selected per site from `site.access`.
    egress: Arc<EgressPool>,
    /// Named sessions, looked up via `site.access.session`.
    sessions: Arc<SessionStore>,
    /// Throttle waited on with the URL's host before each plain-HTTP request.
    throttle: HostThrottle,
    /// Optional throttle waited on with [`GLOBAL_THROTTLE_KEY`]; present when
    /// the builder was given `max_rps`.
    global_throttle: Option<HostThrottle>,
    /// Policy consulted by `check` to decide whether and how long to back off.
    retry: RetryPolicy,
    /// User-agent strings `pick_user_agent` draws from; empty means no override.
    user_agents: Arc<[String]>,
    /// When set, `Found` outcomes on sites with `extract` rules are enriched.
    enrich: bool,
    /// robots.txt cache; present when the builder had `respect_robots` enabled.
    robots: Option<RobotsCache>,
    /// Optional browser backend for protected sites and for escalation.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Impersonating fetcher, tried for sites whose only protection is
    /// `TlsFingerprint` (feature `impersonate` only).
    #[cfg(feature = "impersonate")]
    impersonate: Option<Arc<ImpersonateFetcher>>,
    /// Budget consumed by every direct browser probe.
    browser_budget: Arc<BrowserBudget>,
    /// Budget consumed by every HTTP-to-browser escalation.
    escalation_budget: Arc<crate::escalation::EscalationBudget>,
    /// Master switch for escalation; `false` makes `maybe_escalate` a no-op.
    escalation_enabled: bool,
}
90
91impl Client {
92 pub fn builder() -> ClientBuilder {
94 ClientBuilder::default()
95 }
96
97 #[must_use]
102 pub fn egress_summary(&self) -> Vec<crate::access::EgressSummary> {
103 self.egress.summary()
104 }
105
106 #[must_use]
110 pub fn session_names(&self) -> Vec<String> {
111 self.sessions.names()
112 }
113
114 #[must_use]
118 pub fn egress_names(&self) -> Vec<String> {
119 self.egress.names()
120 }
121
122 #[must_use]
135 pub fn with_egress_subset(&self, names: &[String]) -> Self {
136 Self {
137 http: Arc::clone(&self.http),
138 egress: Arc::new(self.egress.subset(names)),
139 sessions: Arc::clone(&self.sessions),
140 throttle: self.throttle.clone(),
141 global_throttle: self.global_throttle.clone(),
142 retry: self.retry.clone(),
143 user_agents: Arc::clone(&self.user_agents),
144 enrich: self.enrich,
145 robots: self.robots.clone(),
146 browser: self.browser.clone(),
147 #[cfg(feature = "impersonate")]
148 impersonate: self.impersonate.clone(),
149 browser_budget: Arc::clone(&self.browser_budget),
150 escalation_budget: Arc::clone(&self.escalation_budget),
151 escalation_enabled: self.escalation_enabled,
152 }
153 }
154
155 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
169 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
170 let mut attempt: u32 = 0;
171 loop {
172 let outcome = self.probe_once(site, username).await;
173 if !retry::should_retry(&outcome, attempt, &self.retry) {
174 return outcome;
175 }
176 let delay = retry::backoff_delay(attempt, &self.retry);
177 tracing::info!(
178 site = %site.name,
179 attempt = attempt + 1,
180 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
181 ?delay,
182 "transient ban, retrying",
183 );
184 tokio::time::sleep(delay).await;
185 attempt += 1;
186 }
187 }
188
189 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
198 let host = host_of(url);
199 if let Some(global) = &self.global_throttle {
200 global.wait(GLOBAL_THROTTLE_KEY).await;
201 }
202 self.throttle.wait(&host).await;
203 let mut request = self.http.client().get(url);
204 if let Some(ua) = self.pick_user_agent() {
205 request = request.header(reqwest::header::USER_AGENT, ua);
206 }
207 let response = request.send().await.ok()?;
208 let status = response.status().as_u16();
209 let final_url = response.url().to_string();
210 let body = response.text().await.unwrap_or_default();
211 Some(RawResponse {
212 status,
213 final_url,
214 body,
215 })
216 }
217
218 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
229 if let Some(backend) = self.browser.as_deref() {
230 let has_tag = site
231 .tags
232 .iter()
233 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
234 if has_tag || !site.protection.is_empty() {
235 let parsed = url::Url::parse(url).ok()?;
236 match backend
237 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
238 .await
239 {
240 Ok(page) => {
241 return Some(RawResponse {
242 status: page.status,
243 final_url: page.final_url.to_string(),
244 body: page.body,
245 });
246 }
247 Err(err) => {
248 tracing::warn!(
249 site = %site.name, %url, error = %err,
250 "browser fetch failed in doctor; falling back to raw HTTP",
251 );
252 }
253 }
254 }
255 }
256 self.fetch(url).await
257 }
258
259 fn pick_user_agent(&self) -> Option<&str> {
262 match self.user_agents.len() {
263 0 => None,
264 1 => Some(&self.user_agents[0]),
265 n => Some(&self.user_agents[fastrand::usize(0..n)]),
266 }
267 }
268
269 #[allow(clippy::too_many_lines)]
272 async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
273 let url = site.url_for(username);
274
275 if let Some(pat) = &site.regex_check {
285 if let Ok(re) = regex::Regex::new(pat) {
286 if !re.is_match(username.as_str()) {
287 return uncertain(
288 &site.name,
289 url,
290 Instant::now(),
291 UncertainReason::UsernameNotAllowed,
292 );
293 }
294 }
295 }
296
297 let session_headers: Cow<'_, BTreeMap<String, String>> = match &site.access.session {
304 None => Cow::Borrowed(&site.request_headers),
305 Some(name) => match self.sessions.get(name) {
306 Some(session) => Cow::Owned(session.apply(&site.request_headers)),
307 None => {
308 return uncertain(
309 &site.name,
310 url,
311 Instant::now(),
312 UncertainReason::SessionRequired,
313 );
314 }
315 },
316 };
317 let headers: &BTreeMap<String, String> = &session_headers;
318
319 if let Some(backend) = &self.browser {
326 let has_tag = site
327 .tags
328 .iter()
329 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
330 if has_tag || !site.protection.is_empty() {
331 if self.browser_budget.try_consume() {
332 let started = Instant::now();
333 let req = FetchRequest {
334 method: site.request_method,
335 url: &url,
336 body: None,
337 user_agent: None,
338 headers,
339 want_body: true,
340 };
341 let fetcher = BrowserFetcher::new(Arc::clone(backend));
342 let mut outcome = match fetcher.fetch(&req).await {
343 Ok(resp) => self.finish(site, url, started, &resp),
344 Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
345 };
346 outcome.transport = Some(crate::escalation::TransportTier::Browser);
347 return outcome;
348 }
349 tracing::warn!(site = %site.name, "browser budget exhausted");
350 let mut outcome = uncertain(
351 &site.name,
352 url,
353 Instant::now(),
354 UncertainReason::BrowserBudget,
355 );
356 outcome.transport = Some(crate::escalation::TransportTier::Browser);
357 return outcome;
358 }
359 }
360
361 #[cfg(feature = "impersonate")]
368 if let Some(fetcher) = &self.impersonate {
369 let pure_tls = site.protection.len() == 1
370 && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
371 && !site
372 .tags
373 .iter()
374 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
375 if pure_tls {
376 let started = Instant::now();
377 let req = FetchRequest {
378 method: site.request_method,
379 url: &url,
380 body: None,
381 user_agent: self.pick_user_agent(),
382 headers,
383 want_body: true,
384 };
385 let mut primary = match fetcher.fetch(&req).await {
386 Ok(resp) => self.finish(site, url.clone(), started, &resp),
387 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
388 };
389 primary.transport = Some(crate::escalation::TransportTier::Impersonate);
390 return self.maybe_escalate(site, &url, headers, primary).await;
391 }
392 }
393
394 let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
401 EgressChoice::Default => Arc::clone(&self.http),
402 EgressChoice::Use(fetcher) => fetcher,
403 EgressChoice::Unavailable => {
404 return uncertain(
405 &site.name,
406 url,
407 Instant::now(),
408 UncertainReason::GeoUnavailable,
409 );
410 }
411 };
412
413 let host = host_of(&url);
414
415 if let Some(robots) = &self.robots {
417 if let Some((origin, path)) = origin_and_path(&url) {
418 if !robots.allowed(&origin, &path).await {
419 tracing::debug!(%url, "skipped by robots.txt");
420 return uncertain(
421 &site.name,
422 url,
423 Instant::now(),
424 UncertainReason::RobotsDisallowed,
425 );
426 }
427 }
428 }
429
430 if let Some(global) = &self.global_throttle {
432 global.wait(GLOBAL_THROTTLE_KEY).await;
433 }
434 self.throttle.wait(&host).await;
435 let started = Instant::now();
436 tracing::debug!(%url, %host, "probing");
437
438 let want_enrich = self.enrich && !site.extract.is_empty();
441 let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
442
443 let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
448 const USERNAME_PH: &str = "{username}";
449 site.request_body
450 .as_deref()
451 .map(|t| t.replace(USERNAME_PH, username.as_str()))
452 } else {
453 None
454 };
455
456 let req = FetchRequest {
457 method: site.request_method,
458 url: &url,
459 body: body_for_post.as_deref(),
460 user_agent: self.pick_user_agent(),
461 headers,
462 want_body: needs_body,
463 };
464 let mut primary = match egress.fetch(&req).await {
465 Ok(resp) => self.finish(site, url.clone(), started, &resp),
466 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
467 };
468 primary.transport = Some(crate::escalation::TransportTier::Http);
469 self.maybe_escalate(site, &url, headers, primary).await
470 }
471
472 async fn maybe_escalate(
477 &self,
478 site: &Site,
479 url: &str,
480 headers: &BTreeMap<String, String>,
481 primary: CheckOutcome,
482 ) -> CheckOutcome {
483 if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
484 return primary;
485 }
486 let Some(reason) = &primary.reason else {
487 return primary;
488 };
489 if !crate::escalation::should_escalate(reason) {
490 return primary;
491 }
492 let Some(backend) = &self.browser else {
493 return primary;
494 };
495 if !self.escalation_budget.try_consume() {
496 tracing::debug!(site = %site.name, "escalation budget exhausted");
497 return primary;
498 }
499
500 tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
501 let started = Instant::now();
502 let req = FetchRequest {
503 method: site.request_method,
504 url,
505 body: None,
506 user_agent: None,
507 headers,
508 want_body: true,
509 };
510 let fetcher = BrowserFetcher::new(Arc::clone(backend));
511 let mut escalated = match fetcher.fetch(&req).await {
512 Ok(resp) => self.finish(site, url.to_owned(), started, &resp),
513 Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
514 };
515 escalated.transport = Some(crate::escalation::TransportTier::Browser);
516 escalated.escalations = 1;
517 escalated
518 }
519
520 fn finish(
524 &self,
525 site: &Site,
526 url: String,
527 started: Instant,
528 resp: &crate::transport::FetchResponse,
529 ) -> CheckOutcome {
530 let probe = Probe {
531 status: resp.status,
532 final_url: &resp.final_url,
533 body: &resp.body,
534 };
535 let votes: Vec<(&Signal, SignalVerdict)> = site
536 .signals
537 .iter()
538 .map(|s| (s, s.evaluate(&probe)))
539 .collect();
540 let kind = aggregate(votes.iter().map(|(_, v)| *v));
541 let mut result = outcome(&site.name, url, started, kind);
542 let winning = match kind {
544 MatchKind::Found => Some(SignalVerdict::Found),
545 MatchKind::NotFound => Some(SignalVerdict::NotFound),
546 MatchKind::Uncertain => None,
547 };
548 if let Some(want) = winning {
549 result.evidence = votes
550 .iter()
551 .filter(|(_, v)| *v == want)
552 .map(|(s, _)| s.describe_match(&probe))
553 .collect();
554 }
555 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
556 result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
557 }
558 result
559 }
560}
561
/// Minimal view of an HTTP (or browser-rendered) response, as returned by
/// [`Client::fetch`] and [`Client::fetch_for_doctor`].
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// Numeric HTTP status code.
    pub status: u16,
    /// URL reported by the response once any redirects were followed.
    pub final_url: String,
    /// Response body as text; empty when `fetch` could not read it.
    pub body: String,
}
572
/// Configuration collected before constructing a [`Client`] with
/// [`ClientBuilder::build`].
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
#[allow(clippy::struct_excessive_bools)]
pub struct ClientBuilder {
    /// Total request timeout passed to every reqwest client built.
    timeout: Duration,
    /// Connect timeout passed to every reqwest client built.
    connect_timeout: Duration,
    /// User-agent installed on the reqwest clients themselves.
    user_agent: String,
    /// Whether redirects are followed at all.
    follow_redirects: bool,
    /// Redirect cap used when `follow_redirects` is true.
    redirect_limit: usize,
    /// Interval the per-host throttle is created with.
    min_request_interval: Duration,
    /// Optional global rate; converted to an interval of one second / rate.
    max_rps: Option<NonZeroU32>,
    /// Retry policy moved into the client.
    retry: RetryPolicy,
    /// Optional proxy URL for the default HTTP client.
    proxy: Option<String>,
    /// User-agent strings the client picks from per request.
    user_agents: Vec<String>,
    /// Whether `Found` outcomes are enriched.
    enrich: bool,
    /// Whether a robots.txt cache is created.
    respect_robots: bool,
    /// Optional browser backend.
    browser: Option<Arc<dyn BrowserBackend>>,
    /// Capacity handed to `BrowserBudget::new`.
    browser_budget: usize,
    /// Egress specs; each gets its own reqwest client proxied through `spec.url`.
    egress: Vec<EgressSpec>,
    /// Session store moved into the client.
    sessions: SessionStore,
    /// Capacity handed to `EscalationBudget::new`.
    escalation_budget: usize,
    /// Whether the client may escalate inconclusive probes to the browser.
    escalation_enabled: bool,
}
601
/// Defaults: the module-level timeout, redirect and throttle constants, the
/// `adler/<version>` user agent, redirects followed, no proxy, no global rate
/// limit, no user-agent rotation, enrichment and robots handling off, no
/// browser backend, empty egress and session sets, default budgets, and
/// escalation enabled.
impl Default for ClientBuilder {
    fn default() -> Self {
        Self {
            timeout: DEFAULT_TIMEOUT,
            connect_timeout: DEFAULT_CONNECT_TIMEOUT,
            user_agent: default_user_agent(),
            follow_redirects: true,
            redirect_limit: DEFAULT_REDIRECT_LIMIT,
            min_request_interval: DEFAULT_PER_HOST_INTERVAL,
            max_rps: None,
            retry: RetryPolicy::default(),
            proxy: None,
            user_agents: Vec::new(),
            enrich: false,
            respect_robots: false,
            browser: None,
            browser_budget: DEFAULT_BROWSER_BUDGET,
            egress: Vec::new(),
            sessions: SessionStore::new(),
            escalation_budget: DEFAULT_ESCALATION_BUDGET,
            // Escalation is opt-out: see `disable_escalation`.
            escalation_enabled: true,
        }
    }
}
626
627impl ClientBuilder {
628 pub fn timeout(mut self, timeout: Duration) -> Self {
630 self.timeout = timeout;
631 self
632 }
633
634 pub fn connect_timeout(mut self, timeout: Duration) -> Self {
636 self.connect_timeout = timeout;
637 self
638 }
639
640 pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
642 self.user_agent = user_agent.into();
643 self
644 }
645
646 pub fn follow_redirects(mut self, follow: bool) -> Self {
649 self.follow_redirects = follow;
650 self
651 }
652
653 pub fn min_request_interval(mut self, interval: Duration) -> Self {
659 self.min_request_interval = interval;
660 self
661 }
662
663 pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
668 self.max_rps = Some(rps);
669 self
670 }
671
672 pub fn max_retries(mut self, n: u32) -> Self {
675 self.retry.max_retries = n;
676 self
677 }
678
679 pub fn base_backoff_delay(mut self, d: Duration) -> Self {
682 self.retry.base_delay = d;
683 self
684 }
685
686 pub fn max_backoff_delay(mut self, d: Duration) -> Self {
688 self.retry.max_delay = d;
689 self
690 }
691
692 pub fn proxy(mut self, url: impl Into<String>) -> Self {
695 self.proxy = Some(url.into());
696 self
697 }
698
699 pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
703 self.user_agents = agents;
704 self
705 }
706
707 pub fn enrich(mut self, enrich: bool) -> Self {
710 self.enrich = enrich;
711 self
712 }
713
714 pub fn respect_robots(mut self, respect: bool) -> Self {
718 self.respect_robots = respect;
719 self
720 }
721
722 pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
726 self.browser = Some(backend);
727 self
728 }
729
730 pub const fn browser_budget(mut self, cap: usize) -> Self {
735 self.browser_budget = cap;
736 self
737 }
738
739 pub const fn escalation_budget(mut self, cap: usize) -> Self {
746 self.escalation_budget = cap;
747 self
748 }
749
750 pub const fn disable_escalation(mut self) -> Self {
755 self.escalation_enabled = false;
756 self
757 }
758
759 pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
764 self.egress = egress;
765 self
766 }
767
768 pub fn sessions(mut self, sessions: SessionStore) -> Self {
774 self.sessions = sessions;
775 self
776 }
777
778 pub fn build(self) -> Result<Client> {
780 let inner = build_reqwest(
781 &self.user_agent,
782 self.timeout,
783 self.connect_timeout,
784 self.follow_redirects,
785 self.redirect_limit,
786 self.proxy.as_deref(),
787 )?;
788
789 let mut egress_entries = Vec::with_capacity(self.egress.len());
793 for spec in &self.egress {
794 let client = build_reqwest(
795 &self.user_agent,
796 self.timeout,
797 self.connect_timeout,
798 self.follow_redirects,
799 self.redirect_limit,
800 Some(&spec.url),
801 )?;
802 egress_entries.push((
803 spec.name.clone(),
804 spec.country.clone(),
805 spec.kind,
806 Arc::new(HttpFetcher::new(client)),
807 ));
808 }
809
810 let global_throttle = self.max_rps.map(|rps| {
811 let interval = Duration::from_secs(1) / rps.get();
813 HostThrottle::new(interval)
814 });
815 let robots = self
816 .respect_robots
817 .then(|| RobotsCache::new(inner.clone(), "adler"));
818 #[cfg(feature = "impersonate")]
822 let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));
823 Ok(Client {
824 http: Arc::new(HttpFetcher::new(inner)),
825 egress: Arc::new(EgressPool::new(egress_entries)),
826 sessions: Arc::new(self.sessions),
827 throttle: HostThrottle::new(self.min_request_interval),
828 global_throttle,
829 retry: self.retry,
830 user_agents: Arc::from(self.user_agents),
831 enrich: self.enrich,
832 robots,
833 browser: self.browser,
834 browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
835 escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
836 self.escalation_budget,
837 )),
838 escalation_enabled: self.escalation_enabled,
839 #[cfg(feature = "impersonate")]
840 impersonate,
841 })
842 }
843}
844
845fn build_reqwest(
849 user_agent: &str,
850 timeout: Duration,
851 connect_timeout: Duration,
852 follow_redirects: bool,
853 redirect_limit: usize,
854 proxy: Option<&str>,
855) -> Result<reqwest::Client> {
856 let redirect_policy = if follow_redirects {
857 redirect::Policy::limited(redirect_limit)
858 } else {
859 redirect::Policy::none()
860 };
861 let mut builder = reqwest::Client::builder()
862 .user_agent(user_agent.to_owned())
863 .timeout(timeout)
864 .connect_timeout(connect_timeout)
865 .redirect(redirect_policy);
866 if let Some(proxy_url) = proxy {
867 const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
871 if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
872 return Err(Error::HttpSetup {
873 message: format!(
874 "invalid proxy {proxy_url:?}: must start with one of {}",
875 SCHEMES.join(", ")
876 ),
877 });
878 }
879 let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
880 message: format!("invalid proxy {proxy_url:?}: {e}"),
881 })?;
882 builder = builder.proxy(proxy);
883 }
884 builder.build().map_err(|e| Error::HttpSetup {
885 message: e.to_string(),
886 })
887}
888
/// Default capacity of the direct browser-probe budget
/// (see [`ClientBuilder::browser_budget`]).
pub const DEFAULT_BROWSER_BUDGET: usize = 50;

/// Default capacity of the HTTP-to-browser escalation budget
/// (see [`ClientBuilder::escalation_budget`]).
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
907
908impl fmt::Debug for Client {
909 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
910 f.debug_struct("Client")
911 .field("throttle", &self.throttle)
912 .field("global_throttle", &self.global_throttle)
913 .field("retry", &self.retry)
914 .field("user_agents", &self.user_agents)
915 .field("enrich", &self.enrich)
916 .field("robots", &self.robots.is_some())
917 .field("browser", &self.browser.is_some())
918 .field("browser_budget", &self.browser_budget)
919 .field("escalation_budget", &self.escalation_budget)
920 .field("escalation_enabled", &self.escalation_enabled)
921 .finish_non_exhaustive()
922 }
923}
924
925impl fmt::Debug for ClientBuilder {
926 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
927 f.debug_struct("ClientBuilder")
928 .field("timeout", &self.timeout)
929 .field("connect_timeout", &self.connect_timeout)
930 .field("user_agent", &self.user_agent)
931 .field("follow_redirects", &self.follow_redirects)
932 .field("redirect_limit", &self.redirect_limit)
933 .field("min_request_interval", &self.min_request_interval)
934 .field("max_rps", &self.max_rps)
935 .field("retry", &self.retry)
936 .field("proxy", &self.proxy)
937 .field("user_agents", &self.user_agents)
938 .field("enrich", &self.enrich)
939 .field("respect_robots", &self.respect_robots)
940 .field("browser", &self.browser.is_some())
941 .field("browser_budget", &self.browser_budget)
942 .field("egress", &self.egress)
943 .field("sessions", &self.sessions)
944 .field("escalation_budget", &self.escalation_budget)
945 .field("escalation_enabled", &self.escalation_enabled)
946 .finish()
947 }
948}
949
/// Site tag (compared ASCII case-insensitively) that marks a site as
/// bot-protected and routes it to the browser backend when one is configured.
const BOT_PROTECTED_TAG: &str = "bot-protected";
951
952fn default_user_agent() -> String {
953 format!("adler/{}", env!("CARGO_PKG_VERSION"))
954}
955
956fn host_of(url: &str) -> String {
957 reqwest::Url::parse(url)
958 .ok()
959 .and_then(|u| u.host_str().map(str::to_owned))
960 .unwrap_or_else(|| "unknown".into())
961}
962
963fn origin_and_path(url: &str) -> Option<(String, String)> {
966 let parsed = reqwest::Url::parse(url).ok()?;
967 let host = parsed.host_str()?;
968 let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
969 let origin = format!("{}://{host}{port}", parsed.scheme());
970 let path = parsed.query().map_or_else(
971 || parsed.path().to_owned(),
972 |q| format!("{}?{q}", parsed.path()),
973 );
974 Some((origin, path))
975}
976
977fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
978 CheckOutcome {
979 site: site.to_owned(),
980 url,
981 kind,
982 reason: None,
983 elapsed_ms: elapsed_ms(started),
984 enrichment: std::collections::BTreeMap::new(),
985 evidence: Vec::new(),
986 transport: None,
987 escalations: 0,
988 }
989}
990
991fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
992 CheckOutcome {
993 site: site.to_owned(),
994 url,
995 kind: MatchKind::Uncertain,
996 reason: Some(reason),
997 elapsed_ms: elapsed_ms(started),
998 enrichment: std::collections::BTreeMap::new(),
999 evidence: Vec::new(),
1000 transport: None,
1001 escalations: 0,
1002 }
1003}
1004
/// Milliseconds elapsed since `started`, saturating at `u64::MAX`.
fn elapsed_ms(started: Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    match u64::try_from(millis) {
        Ok(ms) => ms,
        Err(_) => u64::MAX,
    }
}
1008
1009#[cfg(test)]
1010mod tests {
1011 use super::*;
1012 use crate::browser::RenderedPage;
1013 use crate::site::{Signal, UrlTemplate};
1014 use wiremock::matchers::{any, method, path};
1015 use wiremock::{Mock, MockServer, ResponseTemplate};
1016
1017 fn build_client() -> Client {
1018 Client::builder()
1019 .timeout(Duration::from_secs(2))
1020 .min_request_interval(Duration::ZERO)
1023 .max_retries(0)
1026 .build()
1027 .expect("client builds")
1028 }
1029
1030 fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
1031 Site {
1032 name: "Mock".into(),
1033 url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
1034 signals,
1035 known_present: None,
1036 known_absent: None,
1037 extract: Vec::new(),
1038 tags: Vec::new(),
1039 request_headers: std::collections::BTreeMap::new(),
1040 regex_check: None,
1041 engine: None,
1042 strip_bad_char: None,
1043 request_method: crate::site::HttpMethod::Get,
1044 request_body: None,
1045 protection: Vec::new(),
1046 disabled: false,
1047 source: None,
1048 popularity: None,
1049 access: crate::AccessPolicy::default(),
1050 }
1051 }
1052
1053 fn user() -> Username {
1054 Username::new("alice").unwrap()
1055 }
1056
1057 #[tokio::test]
1058 async fn regex_check_short_circuits_before_any_request() {
1059 let server = MockServer::start().await;
1063 Mock::given(any())
1064 .respond_with(ResponseTemplate::new(200))
1065 .mount(&server)
1066 .await;
1067 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1068 site.regex_check = Some("^[A-Za-z]{8,}$".into());
1070 let outcome = build_client().check(&site, &user()).await;
1071 assert_eq!(outcome.kind, MatchKind::Uncertain);
1072 assert!(
1073 matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
1074 "expected UsernameNotAllowed, got {:?}",
1075 outcome.reason,
1076 );
1077 let recvd = server.received_requests().await.unwrap_or_default();
1080 assert_eq!(
1081 recvd.len(),
1082 0,
1083 "regex_check mismatch must skip the HTTP request entirely"
1084 );
1085 }
1086
1087 #[tokio::test]
1088 async fn geo_constrained_site_with_no_egress_is_geo_unavailable() {
1089 let server = MockServer::start().await;
1092 Mock::given(any())
1093 .respond_with(ResponseTemplate::new(200))
1094 .mount(&server)
1095 .await;
1096 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1097 site.access = crate::access::AccessPolicy {
1100 geo: vec![crate::access::CountryCode::new("pl").unwrap()],
1101 ip_type: None,
1102 session: None,
1103 };
1104 let outcome = build_client().check(&site, &user()).await;
1105 assert_eq!(outcome.kind, MatchKind::Uncertain);
1106 assert!(
1107 matches!(outcome.reason, Some(UncertainReason::GeoUnavailable)),
1108 "expected GeoUnavailable, got {:?}",
1109 outcome.reason,
1110 );
1111 let recvd = server.received_requests().await.unwrap_or_default();
1114 assert_eq!(
1115 recvd.len(),
1116 0,
1117 "geo-unavailable must skip the HTTP request entirely"
1118 );
1119 }
1120
1121 #[tokio::test]
1122 async fn session_headers_are_sent_on_probe() {
1123 let server = MockServer::start().await;
1126 Mock::given(any())
1127 .and(wiremock::matchers::header("cookie", "sessionid=real"))
1128 .respond_with(ResponseTemplate::new(200))
1129 .mount(&server)
1130 .await;
1131 let mut headers = std::collections::BTreeMap::new();
1132 headers.insert("Cookie".to_string(), "sessionid=real".to_string());
1133 let mut store = SessionStore::new();
1134 store.insert("acct", crate::access::Session::from_headers(headers));
1135 let client = Client::builder()
1136 .timeout(Duration::from_secs(2))
1137 .min_request_interval(Duration::ZERO)
1138 .max_retries(0)
1139 .sessions(store)
1140 .build()
1141 .expect("client builds");
1142 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1143 site.access.session = Some("acct".to_string());
1144 let outcome = client.check(&site, &user()).await;
1145 assert_eq!(
1146 outcome.kind,
1147 MatchKind::Found,
1148 "session cookie should unlock the 200 (got {:?})",
1149 outcome.reason,
1150 );
1151 }
1152
1153 #[tokio::test]
1154 async fn missing_named_session_is_session_required() {
1155 let server = MockServer::start().await;
1156 Mock::given(any())
1157 .respond_with(ResponseTemplate::new(200))
1158 .mount(&server)
1159 .await;
1160 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1161 site.access.session = Some("not-configured".to_string());
1163 let outcome = build_client().check(&site, &user()).await;
1164 assert_eq!(outcome.kind, MatchKind::Uncertain);
1165 assert!(
1166 matches!(outcome.reason, Some(UncertainReason::SessionRequired)),
1167 "expected SessionRequired, got {:?}",
1168 outcome.reason,
1169 );
1170 let recvd = server.received_requests().await.unwrap_or_default();
1171 assert_eq!(
1172 recvd.len(),
1173 0,
1174 "a missing session must skip the request, not probe unauthenticated"
1175 );
1176 }
1177
/// With the `impersonate` feature, a site whose only protection is
/// `TlsFingerprint` is probed once and the request carries a Chrome-shaped
/// user agent.
#[cfg(feature = "impersonate")]
#[tokio::test]
async fn impersonate_routes_pure_tls_fingerprint_site() {
    let mock_server = MockServer::start().await;
    Mock::given(any())
        .respond_with(ResponseTemplate::new(200))
        .mount(&mock_server)
        .await;

    let client = Client::builder()
        .timeout(Duration::from_secs(2))
        .min_request_interval(Duration::ZERO)
        .max_retries(0)
        .build()
        .expect("client builds with impersonate");

    let mut site = site_with(
        &mock_server,
        vec![Signal::StatusFound { codes: vec![200] }],
    );
    site.protection = vec![crate::site::ProtectionKind::TlsFingerprint];

    let result = client.check(&site, &user()).await;
    assert_eq!(
        result.kind,
        MatchKind::Found,
        "expected Found (reason {:?})",
        result.reason,
    );

    let requests = mock_server
        .received_requests()
        .await
        .expect("received requests");
    assert_eq!(requests.len(), 1, "expected exactly one request");

    let ua = match requests[0].headers.get("user-agent") {
        Some(value) => value.to_str().unwrap_or(""),
        None => "",
    };
    assert!(
        ua.contains("Chrome/"),
        "expected Chrome-shaped UA from wreq, got {ua:?}"
    );
}
1218
1219 #[tokio::test]
1220 async fn regex_check_pass_proceeds_to_probe() {
1221 let server = MockServer::start().await;
1222 Mock::given(any())
1223 .and(path("/alice"))
1224 .respond_with(ResponseTemplate::new(200))
1225 .mount(&server)
1226 .await;
1227 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1228 site.regex_check = Some("^[a-z]{3,}$".into());
1230 let outcome = build_client().check(&site, &user()).await;
1231 assert_eq!(outcome.kind, MatchKind::Found);
1232 }
1233
1234 #[tokio::test]
1235 async fn status_signal_reports_found_on_match() {
1236 let server = MockServer::start().await;
1237 Mock::given(any())
1238 .and(path("/alice"))
1239 .respond_with(ResponseTemplate::new(200))
1240 .mount(&server)
1241 .await;
1242 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1243 let outcome = build_client().check(&site, &user()).await;
1244 assert_eq!(outcome.kind, MatchKind::Found);
1245 assert!(outcome.url.ends_with("/alice"));
1246 assert!(outcome.reason.is_none());
1247 assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
1248 }
1249
1250 #[tokio::test]
1251 async fn status_signal_pair_reports_not_found_on_404() {
1252 let server = MockServer::start().await;
1253 Mock::given(any())
1254 .and(path("/alice"))
1255 .respond_with(ResponseTemplate::new(404))
1256 .mount(&server)
1257 .await;
1258 let site = site_with(
1259 &server,
1260 vec![
1261 Signal::StatusFound { codes: vec![200] },
1262 Signal::StatusNotFound { codes: vec![404] },
1263 ],
1264 );
1265 let outcome = build_client().check(&site, &user()).await;
1266 assert_eq!(outcome.kind, MatchKind::NotFound);
1267 assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
1269 }
1270
1271 #[tokio::test]
1272 async fn body_absent_signal_detects_missing_account() {
1273 let server = MockServer::start().await;
1274 Mock::given(any())
1275 .and(path("/alice"))
1276 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
1277 .mount(&server)
1278 .await;
1279 let site = site_with(
1280 &server,
1281 vec![Signal::BodyAbsent {
1282 text: "Profile not found".into(),
1283 }],
1284 );
1285 let outcome = build_client().check(&site, &user()).await;
1286 assert_eq!(outcome.kind, MatchKind::NotFound);
1287 }
1288
1289 #[tokio::test]
1290 async fn body_absent_alone_yields_uncertain_when_marker_missing() {
1291 let server = MockServer::start().await;
1294 Mock::given(any())
1295 .and(path("/alice"))
1296 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
1297 .mount(&server)
1298 .await;
1299 let site = site_with(
1300 &server,
1301 vec![Signal::BodyAbsent {
1302 text: "Profile not found".into(),
1303 }],
1304 );
1305 let outcome = build_client().check(&site, &user()).await;
1306 assert_eq!(outcome.kind, MatchKind::Uncertain);
1307 }
1308
1309 #[tokio::test]
1310 async fn body_present_plus_absent_resolve_to_found() {
1311 let server = MockServer::start().await;
1312 Mock::given(any())
1313 .and(path("/alice"))
1314 .respond_with(
1315 ResponseTemplate::new(200)
1316 .set_body_string(r#"<div class="profile-card">alice</div>"#),
1317 )
1318 .mount(&server)
1319 .await;
1320 let site = site_with(
1321 &server,
1322 vec![
1323 Signal::BodyPresent {
1324 text: "profile-card".into(),
1325 },
1326 Signal::BodyAbsent {
1327 text: "Profile not found".into(),
1328 },
1329 ],
1330 );
1331 let outcome = build_client().check(&site, &user()).await;
1332 assert_eq!(outcome.kind, MatchKind::Found);
1333 }
1334
1335 #[tokio::test]
1336 async fn redirect_absent_signal_detects_missing_account() {
1337 let server = MockServer::start().await;
1338 Mock::given(any())
1339 .and(path("/alice"))
1340 .respond_with(
1341 ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
1342 )
1343 .mount(&server)
1344 .await;
1345 Mock::given(any())
1346 .and(path("/login"))
1347 .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
1348 .mount(&server)
1349 .await;
1350 let site = site_with(
1351 &server,
1352 vec![Signal::RedirectAbsent {
1353 fragment: "/login".into(),
1354 }],
1355 );
1356 let outcome = build_client().check(&site, &user()).await;
1357 assert_eq!(outcome.kind, MatchKind::NotFound);
1358 }
1359
1360 #[tokio::test]
1361 async fn negative_signal_wins_over_positive() {
1362 let server = MockServer::start().await;
1367 Mock::given(any())
1368 .and(path("/alice"))
1369 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1370 .mount(&server)
1371 .await;
1372 let site = site_with(
1373 &server,
1374 vec![
1375 Signal::StatusFound { codes: vec![200] },
1376 Signal::BodyAbsent {
1377 text: "Profile not found".into(),
1378 },
1379 ],
1380 );
1381 let outcome = build_client().check(&site, &user()).await;
1382 assert_eq!(outcome.kind, MatchKind::NotFound);
1383 }
1384
1385 #[tokio::test]
1386 async fn network_failure_yields_uncertain() {
1387 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1388 let port = listener.local_addr().unwrap().port();
1389 drop(listener);
1390
1391 let site = Site {
1392 name: "Dead".into(),
1393 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1394 signals: vec![Signal::StatusFound { codes: vec![200] }],
1395 known_present: None,
1396 known_absent: None,
1397 extract: Vec::new(),
1398 tags: Vec::new(),
1399 request_headers: std::collections::BTreeMap::new(),
1400 regex_check: None,
1401 engine: None,
1402 strip_bad_char: None,
1403 request_method: crate::site::HttpMethod::Get,
1404 request_body: None,
1405 protection: Vec::new(),
1406 disabled: false,
1407 source: None,
1408 popularity: None,
1409 access: crate::AccessPolicy::default(),
1410 };
1411 let client = Client::builder()
1412 .timeout(Duration::from_millis(500))
1413 .connect_timeout(Duration::from_millis(500))
1414 .max_retries(0)
1415 .build()
1416 .unwrap();
1417 let outcome = client.check(&site, &user()).await;
1418 assert_eq!(outcome.kind, MatchKind::Uncertain);
1419 assert!(outcome.reason.is_some());
1420 }
1421
1422 #[tokio::test]
1423 async fn throttle_spaces_consecutive_calls_to_same_host() {
1424 let server = MockServer::start().await;
1425 Mock::given(any())
1426 .and(path("/alice"))
1427 .respond_with(ResponseTemplate::new(200))
1428 .mount(&server)
1429 .await;
1430 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1431 let client = Client::builder()
1436 .timeout(Duration::from_secs(2))
1437 .min_request_interval(Duration::from_millis(300))
1438 .build()
1439 .unwrap();
1440
1441 client.check(&site, &user()).await;
1442 let started = Instant::now();
1443 client.check(&site, &user()).await;
1444 let elapsed = started.elapsed();
1445 assert!(
1446 elapsed >= Duration::from_millis(200),
1447 "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1448 );
1449 }
1450
1451 #[tokio::test]
1452 async fn builder_overrides_user_agent() {
1453 let server = MockServer::start().await;
1454 Mock::given(any())
1455 .and(path("/alice"))
1456 .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1457 .respond_with(ResponseTemplate::new(200))
1458 .mount(&server)
1459 .await;
1460 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1461 let client = Client::builder()
1462 .user_agent("adler-test/1.0")
1463 .build()
1464 .unwrap();
1465 let outcome = client.check(&site, &user()).await;
1466 assert_eq!(outcome.kind, MatchKind::Found);
1467 }
1468
1469 #[tokio::test]
1470 async fn rate_limit_429_yields_uncertain_with_note() {
1471 let server = MockServer::start().await;
1472 Mock::given(any())
1473 .and(path("/alice"))
1474 .respond_with(ResponseTemplate::new(429))
1475 .mount(&server)
1476 .await;
1477 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1478 let outcome = build_client().check(&site, &user()).await;
1479 assert_eq!(outcome.kind, MatchKind::Uncertain);
1480 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1481 }
1482
1483 #[tokio::test]
1484 async fn cloudflare_server_header_yields_uncertain() {
1485 let server = MockServer::start().await;
1486 Mock::given(any())
1487 .and(path("/alice"))
1488 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1489 .mount(&server)
1490 .await;
1491 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1492 let outcome = build_client().check(&site, &user()).await;
1493 assert_eq!(outcome.kind, MatchKind::Uncertain);
1494 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1495 }
1496
1497 #[tokio::test]
1498 async fn cloudflare_interstitial_in_body_yields_uncertain() {
1499 let server = MockServer::start().await;
1502 Mock::given(any())
1503 .and(path("/alice"))
1504 .respond_with(
1505 ResponseTemplate::new(200)
1506 .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1507 )
1508 .mount(&server)
1509 .await;
1510 let site = site_with(
1511 &server,
1512 vec![Signal::BodyAbsent {
1513 text: "Profile not found".into(),
1514 }],
1515 );
1516 let outcome = build_client().check(&site, &user()).await;
1517 assert_eq!(outcome.kind, MatchKind::Uncertain);
1518 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1519 }
1520
1521 #[tokio::test]
1522 async fn ban_detection_does_not_fire_on_legitimate_403() {
1523 let server = MockServer::start().await;
1524 Mock::given(any())
1525 .and(path("/alice"))
1526 .respond_with(ResponseTemplate::new(403))
1527 .mount(&server)
1528 .await;
1529 let site = site_with(
1530 &server,
1531 vec![
1532 Signal::StatusFound { codes: vec![200] },
1533 Signal::StatusNotFound { codes: vec![403] },
1534 ],
1535 );
1536 let outcome = build_client().check(&site, &user()).await;
1537 assert_eq!(outcome.kind, MatchKind::NotFound);
1539 assert!(outcome.reason.is_none());
1540 }
1541
1542 #[tokio::test]
1543 async fn retry_recovers_after_transient_429() {
1544 let server = MockServer::start().await;
1545 Mock::given(any())
1547 .and(path("/alice"))
1548 .respond_with(ResponseTemplate::new(429))
1549 .up_to_n_times(1)
1550 .mount(&server)
1551 .await;
1552 Mock::given(any())
1553 .and(path("/alice"))
1554 .respond_with(ResponseTemplate::new(200))
1555 .mount(&server)
1556 .await;
1557 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1558 let client = Client::builder()
1559 .timeout(Duration::from_secs(2))
1560 .min_request_interval(Duration::ZERO)
1561 .max_retries(2)
1562 .base_backoff_delay(Duration::from_millis(20))
1563 .max_backoff_delay(Duration::from_millis(100))
1564 .build()
1565 .unwrap();
1566 let outcome = client.check(&site, &user()).await;
1567 assert_eq!(outcome.kind, MatchKind::Found);
1568 assert!(outcome.reason.is_none());
1569 }
1570
1571 #[tokio::test]
1572 async fn retry_exhausts_and_returns_uncertain() {
1573 let server = MockServer::start().await;
1574 Mock::given(any())
1575 .and(path("/alice"))
1576 .respond_with(ResponseTemplate::new(429))
1577 .mount(&server)
1578 .await;
1579 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1580 let client = Client::builder()
1581 .timeout(Duration::from_secs(2))
1582 .min_request_interval(Duration::ZERO)
1583 .max_retries(2)
1584 .base_backoff_delay(Duration::from_millis(10))
1585 .max_backoff_delay(Duration::from_millis(50))
1586 .build()
1587 .unwrap();
1588 let outcome = client.check(&site, &user()).await;
1589 assert_eq!(outcome.kind, MatchKind::Uncertain);
1590 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1591 }
1592
1593 #[tokio::test]
1594 async fn retry_does_not_fire_on_network_error() {
1595 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1599 let port = listener.local_addr().unwrap().port();
1600 drop(listener);
1601 let site = Site {
1602 name: "Dead".into(),
1603 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1604 signals: vec![Signal::StatusFound { codes: vec![200] }],
1605 known_present: None,
1606 known_absent: None,
1607 extract: Vec::new(),
1608 tags: Vec::new(),
1609 request_headers: std::collections::BTreeMap::new(),
1610 regex_check: None,
1611 engine: None,
1612 strip_bad_char: None,
1613 request_method: crate::site::HttpMethod::Get,
1614 request_body: None,
1615 protection: Vec::new(),
1616 disabled: false,
1617 source: None,
1618 popularity: None,
1619 access: crate::AccessPolicy::default(),
1620 };
1621 let client = Client::builder()
1622 .timeout(Duration::from_millis(500))
1623 .connect_timeout(Duration::from_millis(500))
1624 .min_request_interval(Duration::ZERO)
1625 .max_retries(3)
1626 .base_backoff_delay(Duration::from_secs(60))
1627 .build()
1628 .unwrap();
1629 let started = Instant::now();
1630 let outcome = client.check(&site, &user()).await;
1631 assert!(started.elapsed() < Duration::from_secs(5));
1634 assert_eq!(outcome.kind, MatchKind::Uncertain);
1635 assert!(
1636 matches!(outcome.reason, Some(UncertainReason::Network(_))),
1637 "got {:?}",
1638 outcome.reason,
1639 );
1640 }
1641
1642 #[tokio::test]
1643 async fn rotates_user_agent_per_request() {
1644 let server = MockServer::start().await;
1648 Mock::given(any())
1649 .and(path("/alice"))
1650 .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1651 .respond_with(ResponseTemplate::new(200))
1652 .mount(&server)
1653 .await;
1654 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1655 let client = Client::builder()
1656 .min_request_interval(Duration::ZERO)
1657 .max_retries(0)
1658 .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1659 .build()
1660 .unwrap();
1661 let outcome = client.check(&site, &user()).await;
1662 assert_eq!(outcome.kind, MatchKind::Found);
1663 }
1664
1665 #[test]
1666 fn invalid_proxy_url_fails_build() {
1667 let err = Client::builder().proxy("not a url").build().unwrap_err();
1668 assert!(matches!(err, Error::HttpSetup { .. }));
1669 }
1670
1671 #[test]
1672 fn schemeless_proxy_is_rejected_up_front() {
1673 let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1675 let Error::HttpSetup { message } = err else {
1676 panic!("expected HttpSetup, got {err:?}");
1677 };
1678 assert!(message.contains("must start with"), "{message}");
1679 }
1680
1681 #[test]
1682 fn socks5_proxy_scheme_is_accepted() {
1683 assert!(
1685 Client::builder()
1686 .proxy("socks5://127.0.0.1:9050")
1687 .build()
1688 .is_ok()
1689 );
1690 }
1691
1692 #[tokio::test]
1693 async fn global_rps_cap_spaces_requests_across_hosts() {
1694 let server = MockServer::start().await;
1697 Mock::given(any())
1698 .respond_with(ResponseTemplate::new(200))
1699 .mount(&server)
1700 .await;
1701 let site_a = Site {
1702 name: "A".into(),
1703 url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1704 signals: vec![Signal::StatusFound { codes: vec![200] }],
1705 known_present: None,
1706 known_absent: None,
1707 extract: Vec::new(),
1708 tags: Vec::new(),
1709 request_headers: std::collections::BTreeMap::new(),
1710 regex_check: None,
1711 engine: None,
1712 strip_bad_char: None,
1713 request_method: crate::site::HttpMethod::Get,
1714 request_body: None,
1715 protection: Vec::new(),
1716 disabled: false,
1717 source: None,
1718 popularity: None,
1719 access: crate::AccessPolicy::default(),
1720 };
1721 let site_b = Site {
1722 name: "B".into(),
1723 url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1724 signals: vec![Signal::StatusFound { codes: vec![200] }],
1725 known_present: None,
1726 known_absent: None,
1727 extract: Vec::new(),
1728 tags: Vec::new(),
1729 request_headers: std::collections::BTreeMap::new(),
1730 regex_check: None,
1731 engine: None,
1732 strip_bad_char: None,
1733 request_method: crate::site::HttpMethod::Get,
1734 request_body: None,
1735 protection: Vec::new(),
1736 disabled: false,
1737 source: None,
1738 popularity: None,
1739 access: crate::AccessPolicy::default(),
1740 };
1741 let client = Client::builder()
1746 .min_request_interval(Duration::ZERO)
1747 .max_retries(0)
1748 .max_rps(std::num::NonZeroU32::new(2).unwrap())
1749 .build()
1750 .unwrap();
1751 client.check(&site_a, &user()).await;
1754 let started = Instant::now();
1755 client.check(&site_b, &user()).await;
1756 assert!(
1757 started.elapsed() >= Duration::from_millis(350),
1758 "global cap should space cross-host requests, got {:?}",
1759 started.elapsed(),
1760 );
1761 }
1762
1763 #[tokio::test]
1764 async fn respect_robots_skips_disallowed_paths() {
1765 let server = MockServer::start().await;
1766 Mock::given(any())
1767 .and(path("/robots.txt"))
1768 .respond_with(
1769 ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1770 )
1771 .mount(&server)
1772 .await;
1773 Mock::given(any())
1774 .and(path("/no/alice"))
1775 .respond_with(ResponseTemplate::new(200))
1776 .mount(&server)
1777 .await;
1778 Mock::given(any())
1779 .and(path("/yes/alice"))
1780 .respond_with(ResponseTemplate::new(200))
1781 .mount(&server)
1782 .await;
1783 let client = Client::builder()
1784 .min_request_interval(Duration::ZERO)
1785 .max_retries(0)
1786 .respect_robots(true)
1787 .build()
1788 .unwrap();
1789
1790 let disallowed = Site {
1791 name: "No".into(),
1792 url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1793 signals: vec![Signal::StatusFound { codes: vec![200] }],
1794 known_present: None,
1795 known_absent: None,
1796 extract: Vec::new(),
1797 tags: Vec::new(),
1798 request_headers: std::collections::BTreeMap::new(),
1799 regex_check: None,
1800 engine: None,
1801 strip_bad_char: None,
1802 request_method: crate::site::HttpMethod::Get,
1803 request_body: None,
1804 protection: Vec::new(),
1805 disabled: false,
1806 source: None,
1807 popularity: None,
1808 access: crate::AccessPolicy::default(),
1809 };
1810 let allowed = Site {
1811 name: "Yes".into(),
1812 url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1813 signals: vec![Signal::StatusFound { codes: vec![200] }],
1814 known_present: None,
1815 known_absent: None,
1816 extract: Vec::new(),
1817 tags: Vec::new(),
1818 request_headers: std::collections::BTreeMap::new(),
1819 regex_check: None,
1820 engine: None,
1821 strip_bad_char: None,
1822 request_method: crate::site::HttpMethod::Get,
1823 request_body: None,
1824 protection: Vec::new(),
1825 disabled: false,
1826 source: None,
1827 popularity: None,
1828 access: crate::AccessPolicy::default(),
1829 };
1830
1831 let no = client.check(&disallowed, &user()).await;
1832 assert_eq!(no.kind, MatchKind::Uncertain);
1833 assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1834
1835 let yes = client.check(&allowed, &user()).await;
1836 assert_eq!(yes.kind, MatchKind::Found);
1837 }
1838
1839 #[tokio::test]
1840 async fn body_read_skipped_when_no_body_signal_needed() {
1841 let server = MockServer::start().await;
1844 Mock::given(any())
1845 .and(path("/alice"))
1846 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1847 .mount(&server)
1848 .await;
1849 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1850 let outcome = build_client().check(&site, &user()).await;
1851 assert_eq!(outcome.kind, MatchKind::Found);
1852 }
1853
1854 #[derive(Debug)]
1860 struct RecordingBackend {
1861 page: RenderedPage,
1862 calls: std::sync::atomic::AtomicUsize,
1863 }
1864
1865 impl RecordingBackend {
1866 fn with_page(page: RenderedPage) -> Self {
1867 Self {
1868 page,
1869 calls: std::sync::atomic::AtomicUsize::new(0),
1870 }
1871 }
1872 fn call_count(&self) -> usize {
1873 self.calls.load(std::sync::atomic::Ordering::SeqCst)
1874 }
1875 }
1876
1877 #[async_trait::async_trait]
1878 impl BrowserBackend for RecordingBackend {
1879 async fn fetch(
1880 &self,
1881 _url: &url::Url,
1882 _headers: &std::collections::BTreeMap<String, String>,
1883 _timeout: Duration,
1884 ) -> Result<RenderedPage> {
1885 self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1886 Ok(self.page.clone())
1887 }
1888 }
1889
1890 fn site_bot_protected(server: &MockServer) -> Site {
1891 let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1892 s.tags = vec!["bot-protected".into()];
1893 s
1894 }
1895
1896 #[tokio::test]
1897 async fn browser_routes_bot_protected_sites() {
1898 let server = MockServer::start().await;
1901 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1902 status: 200,
1903 final_url: url::Url::parse("https://example.com/alice").unwrap(),
1904 body: "<html></html>".into(),
1905 elapsed_ms: 42,
1906 }));
1907 let client = Client::builder()
1908 .min_request_interval(Duration::ZERO)
1909 .max_retries(0)
1910 .browser(backend.clone())
1911 .build()
1912 .unwrap();
1913 let outcome = client.check(&site_bot_protected(&server), &user()).await;
1914 assert_eq!(outcome.kind, MatchKind::Found);
1915 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1916 }
1917
1918 #[tokio::test]
1919 async fn non_bot_protected_sites_skip_browser() {
1920 let server = MockServer::start().await;
1921 Mock::given(any())
1922 .and(path("/alice"))
1923 .respond_with(ResponseTemplate::new(200))
1924 .mount(&server)
1925 .await;
1926 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1927 status: 500, final_url: url::Url::parse("https://x/").unwrap(),
1929 body: String::new(),
1930 elapsed_ms: 0,
1931 }));
1932 let client = Client::builder()
1933 .min_request_interval(Duration::ZERO)
1934 .max_retries(0)
1935 .browser(backend.clone())
1936 .build()
1937 .unwrap();
1938 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1940 let outcome = client.check(&site, &user()).await;
1941 assert_eq!(outcome.kind, MatchKind::Found);
1942 assert_eq!(backend.call_count(), 0, "browser must not be touched");
1943 }
1944
1945 #[tokio::test]
1946 async fn browser_budget_exhaust_yields_uncertain() {
1947 let server = MockServer::start().await;
1948 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1949 status: 200,
1950 final_url: url::Url::parse("https://x/").unwrap(),
1951 body: String::new(),
1952 elapsed_ms: 0,
1953 }));
1954 let client = Client::builder()
1955 .min_request_interval(Duration::ZERO)
1956 .max_retries(0)
1957 .browser(backend.clone())
1958 .browser_budget(1)
1959 .build()
1960 .unwrap();
1961 let site = site_bot_protected(&server);
1962 let first = client.check(&site, &user()).await;
1964 assert_eq!(first.kind, MatchKind::Found);
1965 let second = client.check(&site, &user()).await;
1967 assert_eq!(second.kind, MatchKind::Uncertain);
1968 assert!(matches!(
1969 second.reason,
1970 Some(UncertainReason::BrowserBudget)
1971 ));
1972 assert_eq!(
1973 backend.call_count(),
1974 1,
1975 "second call must not invoke backend"
1976 );
1977 }
1978
1979 #[tokio::test]
1980 async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1981 struct FailingBackend;
1982 #[async_trait::async_trait]
1983 impl BrowserBackend for FailingBackend {
1984 async fn fetch(
1985 &self,
1986 _url: &url::Url,
1987 _headers: &std::collections::BTreeMap<String, String>,
1988 _timeout: Duration,
1989 ) -> Result<RenderedPage> {
1990 Err(Error::BrowserSetup {
1991 message: "simulated crash".into(),
1992 })
1993 }
1994 }
1995 impl std::fmt::Debug for FailingBackend {
1996 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1997 f.write_str("FailingBackend")
1998 }
1999 }
2000
2001 let server = MockServer::start().await;
2002 let client = Client::builder()
2003 .min_request_interval(Duration::ZERO)
2004 .max_retries(0)
2005 .browser(Arc::new(FailingBackend))
2006 .build()
2007 .unwrap();
2008 let outcome = client.check(&site_bot_protected(&server), &user()).await;
2009 assert_eq!(outcome.kind, MatchKind::Uncertain);
2010 match outcome.reason {
2011 Some(UncertainReason::BrowserFailed(msg)) => {
2012 assert!(msg.contains("simulated crash"), "got: {msg}");
2013 }
2014 other => panic!("expected BrowserFailed, got {other:?}"),
2015 }
2016 }
2017
2018 #[tokio::test]
2019 async fn status_only_site_uses_head_request() {
2020 let server = MockServer::start().await;
2024 Mock::given(method("HEAD"))
2025 .and(path("/alice"))
2026 .respond_with(ResponseTemplate::new(200))
2027 .mount(&server)
2028 .await;
2029 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2030 let outcome = build_client().check(&site, &user()).await;
2031 assert_eq!(outcome.kind, MatchKind::Found);
2032 let recvd = server.received_requests().await.unwrap_or_default();
2033 assert_eq!(recvd.len(), 1);
2034 assert_eq!(recvd[0].method.as_str(), "HEAD");
2035 }
2036
2037 #[tokio::test]
2038 async fn body_signal_site_uses_get_request() {
2039 let server = MockServer::start().await;
2042 Mock::given(any())
2043 .and(path("/alice"))
2044 .respond_with(ResponseTemplate::new(200).set_body_string("hello alice"))
2045 .mount(&server)
2046 .await;
2047 let site = site_with(
2048 &server,
2049 vec![Signal::BodyPresent {
2050 text: "hello".into(),
2051 }],
2052 );
2053 let outcome = build_client().check(&site, &user()).await;
2054 assert_eq!(outcome.kind, MatchKind::Found);
2055 let recvd = server.received_requests().await.unwrap_or_default();
2056 assert_eq!(recvd[0].method.as_str(), "GET");
2057 }
2058
2059 #[tokio::test]
2060 async fn protection_field_routes_through_browser_like_bot_protected_tag() {
2061 let server = MockServer::start().await;
2066 Mock::given(any())
2067 .respond_with(ResponseTemplate::new(200))
2068 .mount(&server)
2069 .await;
2070 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2071 site.protection = vec![crate::site::ProtectionKind::Cloudflare];
2072 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2074 status: 200,
2075 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2076 body: String::new(),
2077 elapsed_ms: 0,
2078 }));
2079 let client = Client::builder()
2080 .min_request_interval(Duration::ZERO)
2081 .max_retries(0)
2082 .browser(backend)
2083 .build()
2084 .unwrap();
2085 let outcome = client.check(&site, &user()).await;
2086 assert_eq!(outcome.kind, MatchKind::Found);
2089 let recvd = server.received_requests().await.unwrap_or_default();
2091 assert_eq!(
2092 recvd.len(),
2093 0,
2094 "structured protection must skip the raw HTTP path"
2095 );
2096 }
2097
2098 #[tokio::test]
2099 async fn post_method_sends_body_with_username_substituted() {
2100 let server = MockServer::start().await;
2104 Mock::given(method("POST"))
2105 .and(path("/api"))
2106 .respond_with(ResponseTemplate::new(200))
2107 .mount(&server)
2108 .await;
2109 let site = Site {
2114 name: "ApiPost".into(),
2115 url: UrlTemplate::new(format!("{}/api?_={{username}}", server.uri())).unwrap(),
2116 signals: vec![Signal::StatusFound { codes: vec![200] }],
2117 known_present: None,
2118 known_absent: None,
2119 extract: Vec::new(),
2120 tags: Vec::new(),
2121 request_headers: std::collections::BTreeMap::new(),
2122 regex_check: None,
2123 engine: None,
2124 strip_bad_char: None,
2125 request_method: HttpMethod::Post,
2126 request_body: Some(r#"{"name":"{username}"}"#.into()),
2127 protection: Vec::new(),
2128 disabled: false,
2129 source: None,
2130 popularity: None,
2131 access: crate::AccessPolicy::default(),
2132 };
2133 let outcome = build_client().check(&site, &user()).await;
2134 assert_eq!(outcome.kind, MatchKind::Found);
2135 let recvd = server.received_requests().await.unwrap_or_default();
2136 assert_eq!(recvd.len(), 1);
2137 assert_eq!(recvd[0].method.as_str(), "POST");
2138 let body = String::from_utf8_lossy(&recvd[0].body).to_string();
2139 assert!(body.contains("\"name\":\"alice\""), "body was: {body}");
2140 }
2141
2142 #[tokio::test]
2143 async fn head_405_falls_back_to_get() {
2144 let server = MockServer::start().await;
2147 Mock::given(method("HEAD"))
2148 .and(path("/alice"))
2149 .respond_with(ResponseTemplate::new(405))
2150 .mount(&server)
2151 .await;
2152 Mock::given(any())
2153 .and(path("/alice"))
2154 .respond_with(ResponseTemplate::new(200))
2155 .mount(&server)
2156 .await;
2157 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2158 let outcome = build_client().check(&site, &user()).await;
2159 assert_eq!(outcome.kind, MatchKind::Found);
2160 let recvd = server.received_requests().await.unwrap_or_default();
2161 assert_eq!(recvd.len(), 2);
2162 assert_eq!(recvd[0].method.as_str(), "HEAD");
2163 assert_eq!(recvd[1].method.as_str(), "GET");
2164 }
2165
2166 async fn cloudflare_503_server() -> MockServer {
2175 let server = MockServer::start().await;
2176 Mock::given(any())
2177 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
2178 .mount(&server)
2179 .await;
2180 server
2181 }
2182
2183 #[tokio::test]
2184 async fn http_success_stamps_http_transport_no_escalations() {
2185 let server = MockServer::start().await;
2186 Mock::given(any())
2187 .respond_with(ResponseTemplate::new(200))
2188 .mount(&server)
2189 .await;
2190 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2191 let outcome = build_client().check(&site, &user()).await;
2192 assert_eq!(outcome.kind, MatchKind::Found);
2193 assert_eq!(
2194 outcome.transport,
2195 Some(crate::escalation::TransportTier::Http),
2196 "successful HTTP probe must stamp Http transport"
2197 );
2198 assert_eq!(outcome.escalations, 0, "no escalation on the happy path");
2199 }
2200
2201 #[tokio::test]
2202 async fn escalates_cloudflare_uncertain_to_browser_and_stamps_one() {
2203 let server = cloudflare_503_server().await;
2204 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2206 status: 200,
2207 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2208 body: String::new(),
2209 elapsed_ms: 5,
2210 }));
2211 let client = Client::builder()
2212 .min_request_interval(Duration::ZERO)
2213 .max_retries(0)
2214 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2215 .build()
2216 .unwrap();
2217 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2220 let outcome = client.check(&site, &user()).await;
2221 assert_eq!(
2222 outcome.kind,
2223 MatchKind::Found,
2224 "escalation should flip CF challenge to Found via browser (reason {:?})",
2225 outcome.reason
2226 );
2227 assert_eq!(
2228 outcome.transport,
2229 Some(crate::escalation::TransportTier::Browser),
2230 "escalated outcome must be stamped Browser"
2231 );
2232 assert_eq!(
2233 outcome.escalations, 1,
2234 "exactly one escalation should have fired"
2235 );
2236 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
2237 }
2238
2239 #[tokio::test]
2240 async fn disable_escalation_leaves_cloudflare_uncertain_untouched() {
2241 let server = cloudflare_503_server().await;
2242 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2243 status: 200,
2244 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2245 body: String::new(),
2246 elapsed_ms: 0,
2247 }));
2248 let client = Client::builder()
2249 .min_request_interval(Duration::ZERO)
2250 .max_retries(0)
2251 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2252 .disable_escalation()
2253 .build()
2254 .unwrap();
2255 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2256 let outcome = client.check(&site, &user()).await;
2257 assert_eq!(outcome.kind, MatchKind::Uncertain);
2258 assert!(matches!(
2259 outcome.reason,
2260 Some(UncertainReason::CloudflareChallenge)
2261 ));
2262 assert_eq!(
2263 outcome.transport,
2264 Some(crate::escalation::TransportTier::Http),
2265 "primary transport must still be stamped"
2266 );
2267 assert_eq!(outcome.escalations, 0);
2268 assert_eq!(
2269 backend.call_count(),
2270 0,
2271 "browser must not be touched when --no-escalation"
2272 );
2273 }
2274
2275 #[tokio::test]
2276 async fn escalation_budget_zero_keeps_browser_untouched() {
2277 let server = cloudflare_503_server().await;
2278 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2279 status: 200,
2280 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2281 body: String::new(),
2282 elapsed_ms: 0,
2283 }));
2284 let client = Client::builder()
2285 .min_request_interval(Duration::ZERO)
2286 .max_retries(0)
2287 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2288 .escalation_budget(0)
2289 .build()
2290 .unwrap();
2291 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2292 let outcome = client.check(&site, &user()).await;
2293 assert_eq!(outcome.kind, MatchKind::Uncertain);
2294 assert!(matches!(
2295 outcome.reason,
2296 Some(UncertainReason::CloudflareChallenge)
2297 ));
2298 assert_eq!(outcome.escalations, 0);
2299 assert_eq!(
2300 backend.call_count(),
2301 0,
2302 "zero budget must deny every escalation"
2303 );
2304 }
2305
2306 #[tokio::test]
2307 async fn escalation_consumes_budget_then_stops() {
2308 let server = cloudflare_503_server().await;
2309 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
2310 status: 200,
2311 final_url: url::Url::parse(&format!("{}/alice", server.uri())).unwrap(),
2312 body: String::new(),
2313 elapsed_ms: 0,
2314 }));
2315 let client = Client::builder()
2316 .min_request_interval(Duration::ZERO)
2317 .max_retries(0)
2318 .browser(Arc::clone(&backend) as Arc<dyn BrowserBackend>)
2319 .escalation_budget(1)
2320 .build()
2321 .unwrap();
2322 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
2323 let first = client.check(&site, &user()).await;
2325 assert_eq!(first.kind, MatchKind::Found);
2326 assert_eq!(first.escalations, 1);
2327 let second = client.check(&site, &user()).await;
2329 assert_eq!(second.kind, MatchKind::Uncertain);
2330 assert!(matches!(
2331 second.reason,
2332 Some(UncertainReason::CloudflareChallenge)
2333 ));
2334 assert_eq!(second.escalations, 0);
2335 assert_eq!(backend.call_count(), 1, "browser called exactly once total");
2336 }
2337}