1use std::fmt;
11use std::num::NonZeroU32;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use reqwest::redirect;
16
17use crate::ban;
18use crate::browser::{BrowserBackend, BrowserBudget, RenderedPage};
19use crate::check::{CheckOutcome, MatchKind, UncertainReason};
20use crate::error::{Error, Result};
21use crate::retry::{self, RetryPolicy};
22use crate::robots::RobotsCache;
23use crate::site::{Probe, Signal, SignalVerdict, Site, aggregate};
24use crate::throttle::HostThrottle;
25use crate::username::Username;
26
/// Overall request timeout a fresh `ClientBuilder` starts with.
const DEFAULT_TIMEOUT: Duration = Duration::from_secs(10);

/// Connect timeout a fresh `ClientBuilder` starts with.
const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);

/// Redirect cap used when redirect following is enabled.
const DEFAULT_REDIRECT_LIMIT: usize = 8;

/// Per-host request spacing a fresh `ClientBuilder` starts with.
const DEFAULT_PER_HOST_INTERVAL: Duration = Duration::from_millis(100);

/// The one key the global throttle is awaited under, whatever the target host.
const GLOBAL_THROTTLE_KEY: &str = "*global*";
33
34#[derive(Clone)]
42pub struct Client {
43 inner: reqwest::Client,
44 throttle: HostThrottle,
45 global_throttle: Option<HostThrottle>,
47 retry: RetryPolicy,
48 user_agents: Arc<[String]>,
51 enrich: bool,
53 robots: Option<RobotsCache>,
55 browser: Option<Arc<dyn BrowserBackend>>,
58 browser_budget: Arc<BrowserBudget>,
61}
62
63impl Client {
64 pub fn builder() -> ClientBuilder {
66 ClientBuilder::default()
67 }
68
69 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
83 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
84 let mut attempt: u32 = 0;
85 loop {
86 let outcome = self.probe_once(site, username).await;
87 if !retry::should_retry(&outcome, attempt, &self.retry) {
88 return outcome;
89 }
90 let delay = retry::backoff_delay(attempt, &self.retry);
91 tracing::info!(
92 site = %site.name,
93 attempt = attempt + 1,
94 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
95 ?delay,
96 "transient ban, retrying",
97 );
98 tokio::time::sleep(delay).await;
99 attempt += 1;
100 }
101 }
102
103 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
112 let host = host_of(url);
113 if let Some(global) = &self.global_throttle {
114 global.wait(GLOBAL_THROTTLE_KEY).await;
115 }
116 self.throttle.wait(&host).await;
117 let mut request = self.inner.get(url);
118 if let Some(ua) = self.pick_user_agent() {
119 request = request.header(reqwest::header::USER_AGENT, ua);
120 }
121 let response = request.send().await.ok()?;
122 let status = response.status().as_u16();
123 let final_url = response.url().to_string();
124 let body = response.text().await.unwrap_or_default();
125 Some(RawResponse {
126 status,
127 final_url,
128 body,
129 })
130 }
131
132 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
143 if let Some(backend) = self.browser.as_deref() {
144 if site
145 .tags
146 .iter()
147 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
148 {
149 let parsed = url::Url::parse(url).ok()?;
150 match backend
151 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
152 .await
153 {
154 Ok(page) => {
155 return Some(RawResponse {
156 status: page.status,
157 final_url: page.final_url.to_string(),
158 body: page.body,
159 });
160 }
161 Err(err) => {
162 tracing::warn!(
163 site = %site.name, %url, error = %err,
164 "browser fetch failed in doctor; falling back to raw HTTP",
165 );
166 }
167 }
168 }
169 }
170 self.fetch(url).await
171 }
172
173 fn pick_user_agent(&self) -> Option<&str> {
176 match self.user_agents.len() {
177 0 => None,
178 1 => Some(&self.user_agents[0]),
179 n => Some(&self.user_agents[fastrand::usize(0..n)]),
180 }
181 }
182
183 #[allow(clippy::too_many_lines)]
186 async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
187 let url = site.url_for(username);
188
189 if let Some(pat) = &site.regex_check {
199 if let Ok(re) = regex::Regex::new(pat) {
200 if !re.is_match(username.as_str()) {
201 return uncertain(
202 &site.name,
203 url,
204 Instant::now(),
205 UncertainReason::UsernameNotAllowed,
206 );
207 }
208 }
209 }
210
211 if let Some(backend) = self.browser.as_deref() {
215 if site
216 .tags
217 .iter()
218 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
219 {
220 if self.browser_budget.try_consume() {
221 return self.probe_with_browser(site, &url, backend).await;
222 }
223 tracing::warn!(site = %site.name, "browser budget exhausted");
224 return uncertain(
225 &site.name,
226 url,
227 Instant::now(),
228 UncertainReason::BrowserBudget,
229 );
230 }
231 }
232
233 let host = host_of(&url);
234
235 if let Some(robots) = &self.robots {
237 if let Some((origin, path)) = origin_and_path(&url) {
238 if !robots.allowed(&origin, &path).await {
239 tracing::debug!(%url, "skipped by robots.txt");
240 return uncertain(
241 &site.name,
242 url,
243 Instant::now(),
244 UncertainReason::RobotsDisallowed,
245 );
246 }
247 }
248 }
249
250 if let Some(global) = &self.global_throttle {
252 global.wait(GLOBAL_THROTTLE_KEY).await;
253 }
254 self.throttle.wait(&host).await;
255 let started = Instant::now();
256 tracing::debug!(%url, %host, "probing");
257
258 let mut request = self.inner.get(&url);
259 if let Some(ua) = self.pick_user_agent() {
260 request = request.header(reqwest::header::USER_AGENT, ua);
261 }
262 let response = match request.send().await {
263 Ok(r) => r,
264 Err(err) => {
265 tracing::debug!(error = %err, "request failed");
266 return uncertain(
267 &site.name,
268 url,
269 started,
270 UncertainReason::Network(err.to_string()),
271 );
272 }
273 };
274
275 let status = response.status().as_u16();
276 let final_url = response.url().to_string();
277
278 if let Some(reason) = ban::detect_pre_body(status, response.headers()) {
279 tracing::warn!(%host, status, %reason, "ban-like response");
280 return uncertain(&site.name, url, started, reason);
281 }
282
283 let want_enrich = self.enrich && !site.extract.is_empty();
286 let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
287 let body = if needs_body {
288 match response.text().await {
289 Ok(b) => b,
290 Err(err) => {
291 return uncertain(
292 &site.name,
293 url,
294 started,
295 UncertainReason::BodyRead(err.to_string()),
296 );
297 }
298 }
299 } else {
300 String::new()
301 };
302
303 if !body.is_empty() {
304 if let Some(reason) = ban::detect_in_body(&body) {
305 tracing::warn!(%host, %reason, "ban-like body");
306 return uncertain(&site.name, url, started, reason);
307 }
308 }
309
310 let probe = Probe {
311 status,
312 final_url: &final_url,
313 body: &body,
314 };
315 let votes: Vec<(&Signal, SignalVerdict)> = site
316 .signals
317 .iter()
318 .map(|s| (s, s.evaluate(&probe)))
319 .collect();
320 let kind = aggregate(votes.iter().map(|(_, v)| *v));
321 let mut result = outcome(&site.name, url, started, kind);
322 let winning = match kind {
324 MatchKind::Found => Some(SignalVerdict::Found),
325 MatchKind::NotFound => Some(SignalVerdict::NotFound),
326 MatchKind::Uncertain => None,
327 };
328 if let Some(want) = winning {
329 result.evidence = votes
330 .iter()
331 .filter(|(_, v)| *v == want)
332 .map(|(s, _)| s.describe_match(&probe))
333 .collect();
334 }
335 if want_enrich && kind == MatchKind::Found {
336 result.enrichment = crate::enrich::extract(&body, &site.extract);
337 }
338 result
339 }
340
341 async fn probe_with_browser(
346 &self,
347 site: &Site,
348 url: &str,
349 backend: &dyn BrowserBackend,
350 ) -> CheckOutcome {
351 let started = Instant::now();
352 let parsed = match url::Url::parse(url) {
353 Ok(u) => u,
354 Err(err) => {
355 return uncertain(
356 &site.name,
357 url.to_owned(),
358 started,
359 UncertainReason::Other(format!("invalid url: {err}")),
360 );
361 }
362 };
363
364 let page: RenderedPage = match backend
365 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
366 .await
367 {
368 Ok(p) => p,
369 Err(err) => {
370 tracing::warn!(site = %site.name, %url, error = %err, "browser fetch failed");
371 return uncertain(
372 &site.name,
373 url.to_owned(),
374 started,
375 UncertainReason::BrowserFailed(err.to_string()),
376 );
377 }
378 };
379
380 let final_url_str = page.final_url.as_str().to_owned();
381 let probe = Probe {
382 status: page.status,
383 final_url: &final_url_str,
384 body: &page.body,
385 };
386 let votes: Vec<(&Signal, SignalVerdict)> = site
387 .signals
388 .iter()
389 .map(|s| (s, s.evaluate(&probe)))
390 .collect();
391 let kind = aggregate(votes.iter().map(|(_, v)| *v));
392 let mut result = outcome(&site.name, url.to_owned(), started, kind);
393 let winning = match kind {
394 MatchKind::Found => Some(SignalVerdict::Found),
395 MatchKind::NotFound => Some(SignalVerdict::NotFound),
396 MatchKind::Uncertain => None,
397 };
398 if let Some(want) = winning {
399 result.evidence = votes
400 .iter()
401 .filter(|(_, v)| *v == want)
402 .map(|(s, _)| s.describe_match(&probe))
403 .collect();
404 }
405 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
406 result.enrichment = crate::enrich::extract(&page.body, &site.extract);
407 }
408 result
409 }
410}
411
/// Unevaluated result of one fetch: no signals have been applied to it.
#[derive(Debug, Clone)]
pub struct RawResponse {
    /// HTTP status code of the response.
    pub status: u16,
    /// URL the response was finally served from.
    pub final_url: String,
    /// Response body as text; the raw fetch path leaves it empty when the
    /// body could not be read.
    pub body: String,
}
422
423#[derive(Clone)]
425#[must_use = "ClientBuilder does nothing until `.build()` is called"]
426pub struct ClientBuilder {
427 timeout: Duration,
428 connect_timeout: Duration,
429 user_agent: String,
430 follow_redirects: bool,
431 redirect_limit: usize,
432 min_request_interval: Duration,
433 max_rps: Option<NonZeroU32>,
434 retry: RetryPolicy,
435 proxy: Option<String>,
436 user_agents: Vec<String>,
437 enrich: bool,
438 respect_robots: bool,
439 browser: Option<Arc<dyn BrowserBackend>>,
440 browser_budget: usize,
441}
442
443impl Default for ClientBuilder {
444 fn default() -> Self {
445 Self {
446 timeout: DEFAULT_TIMEOUT,
447 connect_timeout: DEFAULT_CONNECT_TIMEOUT,
448 user_agent: default_user_agent(),
449 follow_redirects: true,
450 redirect_limit: DEFAULT_REDIRECT_LIMIT,
451 min_request_interval: DEFAULT_PER_HOST_INTERVAL,
452 max_rps: None,
453 retry: RetryPolicy::default(),
454 proxy: None,
455 user_agents: Vec::new(),
456 enrich: false,
457 respect_robots: false,
458 browser: None,
459 browser_budget: DEFAULT_BROWSER_BUDGET,
460 }
461 }
462}
463
464impl ClientBuilder {
465 pub fn timeout(mut self, timeout: Duration) -> Self {
467 self.timeout = timeout;
468 self
469 }
470
471 pub fn connect_timeout(mut self, timeout: Duration) -> Self {
473 self.connect_timeout = timeout;
474 self
475 }
476
477 pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
479 self.user_agent = user_agent.into();
480 self
481 }
482
483 pub fn follow_redirects(mut self, follow: bool) -> Self {
486 self.follow_redirects = follow;
487 self
488 }
489
490 pub fn min_request_interval(mut self, interval: Duration) -> Self {
496 self.min_request_interval = interval;
497 self
498 }
499
500 pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
505 self.max_rps = Some(rps);
506 self
507 }
508
509 pub fn max_retries(mut self, n: u32) -> Self {
512 self.retry.max_retries = n;
513 self
514 }
515
516 pub fn base_backoff_delay(mut self, d: Duration) -> Self {
519 self.retry.base_delay = d;
520 self
521 }
522
523 pub fn max_backoff_delay(mut self, d: Duration) -> Self {
525 self.retry.max_delay = d;
526 self
527 }
528
529 pub fn proxy(mut self, url: impl Into<String>) -> Self {
532 self.proxy = Some(url.into());
533 self
534 }
535
536 pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
540 self.user_agents = agents;
541 self
542 }
543
544 pub fn enrich(mut self, enrich: bool) -> Self {
547 self.enrich = enrich;
548 self
549 }
550
551 pub fn respect_robots(mut self, respect: bool) -> Self {
555 self.respect_robots = respect;
556 self
557 }
558
559 pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
563 self.browser = Some(backend);
564 self
565 }
566
567 pub const fn browser_budget(mut self, cap: usize) -> Self {
572 self.browser_budget = cap;
573 self
574 }
575
576 pub fn build(self) -> Result<Client> {
578 let redirect_policy = if self.follow_redirects {
579 redirect::Policy::limited(self.redirect_limit)
580 } else {
581 redirect::Policy::none()
582 };
583 let mut builder = reqwest::Client::builder()
584 .user_agent(self.user_agent)
585 .timeout(self.timeout)
586 .connect_timeout(self.connect_timeout)
587 .redirect(redirect_policy);
588 if let Some(proxy_url) = &self.proxy {
589 const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];
593 if !SCHEMES.iter().any(|s| proxy_url.starts_with(s)) {
594 return Err(Error::HttpSetup {
595 message: format!(
596 "invalid proxy {proxy_url:?}: must start with one of {}",
597 SCHEMES.join(", ")
598 ),
599 });
600 }
601 let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
602 message: format!("invalid proxy {proxy_url:?}: {e}"),
603 })?;
604 builder = builder.proxy(proxy);
605 }
606 let inner = builder.build().map_err(|e| Error::HttpSetup {
607 message: e.to_string(),
608 })?;
609 let global_throttle = self.max_rps.map(|rps| {
610 let interval = Duration::from_secs(1) / rps.get();
612 HostThrottle::new(interval)
613 });
614 let robots = self
615 .respect_robots
616 .then(|| RobotsCache::new(inner.clone(), "adler"));
617 Ok(Client {
618 inner,
619 throttle: HostThrottle::new(self.min_request_interval),
620 global_throttle,
621 retry: self.retry,
622 user_agents: Arc::from(self.user_agents),
623 enrich: self.enrich,
624 robots,
625 browser: self.browser,
626 browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
627 })
628 }
629}
630
/// Browser budget a `ClientBuilder` starts with unless `browser_budget`
/// overrides it.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
638
639impl fmt::Debug for Client {
640 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
641 f.debug_struct("Client")
642 .field("throttle", &self.throttle)
643 .field("global_throttle", &self.global_throttle)
644 .field("retry", &self.retry)
645 .field("user_agents", &self.user_agents)
646 .field("enrich", &self.enrich)
647 .field("robots", &self.robots.is_some())
648 .field("browser", &self.browser.is_some())
649 .field("browser_budget", &self.browser_budget)
650 .finish_non_exhaustive()
651 }
652}
653
654impl fmt::Debug for ClientBuilder {
655 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
656 f.debug_struct("ClientBuilder")
657 .field("timeout", &self.timeout)
658 .field("connect_timeout", &self.connect_timeout)
659 .field("user_agent", &self.user_agent)
660 .field("follow_redirects", &self.follow_redirects)
661 .field("redirect_limit", &self.redirect_limit)
662 .field("min_request_interval", &self.min_request_interval)
663 .field("max_rps", &self.max_rps)
664 .field("retry", &self.retry)
665 .field("proxy", &self.proxy)
666 .field("user_agents", &self.user_agents)
667 .field("enrich", &self.enrich)
668 .field("respect_robots", &self.respect_robots)
669 .field("browser", &self.browser.is_some())
670 .field("browser_budget", &self.browser_budget)
671 .finish()
672 }
673}
674
/// Timeout passed to every browser-backend fetch.
const BROWSER_TIMEOUT: Duration = Duration::from_secs(60);

/// Site tag (matched ignoring ASCII case) that routes a site to the browser
/// backend.
const BOT_PROTECTED_TAG: &str = "bot-protected";
681
682fn default_user_agent() -> String {
683 format!("adler/{}", env!("CARGO_PKG_VERSION"))
684}
685
686fn host_of(url: &str) -> String {
687 reqwest::Url::parse(url)
688 .ok()
689 .and_then(|u| u.host_str().map(str::to_owned))
690 .unwrap_or_else(|| "unknown".into())
691}
692
693fn origin_and_path(url: &str) -> Option<(String, String)> {
696 let parsed = reqwest::Url::parse(url).ok()?;
697 let host = parsed.host_str()?;
698 let port = parsed.port().map_or_else(String::new, |p| format!(":{p}"));
699 let origin = format!("{}://{host}{port}", parsed.scheme());
700 let path = parsed.query().map_or_else(
701 || parsed.path().to_owned(),
702 |q| format!("{}?{q}", parsed.path()),
703 );
704 Some((origin, path))
705}
706
707fn outcome(site: &str, url: String, started: Instant, kind: MatchKind) -> CheckOutcome {
708 CheckOutcome {
709 site: site.to_owned(),
710 url,
711 kind,
712 reason: None,
713 elapsed_ms: elapsed_ms(started),
714 enrichment: std::collections::BTreeMap::new(),
715 evidence: Vec::new(),
716 }
717}
718
719fn uncertain(site: &str, url: String, started: Instant, reason: UncertainReason) -> CheckOutcome {
720 CheckOutcome {
721 site: site.to_owned(),
722 url,
723 kind: MatchKind::Uncertain,
724 reason: Some(reason),
725 elapsed_ms: elapsed_ms(started),
726 enrichment: std::collections::BTreeMap::new(),
727 evidence: Vec::new(),
728 }
729}
730
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// should the value not fit.
fn elapsed_ms(started: Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    millis.try_into().unwrap_or(u64::MAX)
}
734
735#[cfg(test)]
736mod tests {
737 use super::*;
738 use crate::site::{Signal, UrlTemplate};
739 use wiremock::matchers::{method, path};
740 use wiremock::{Mock, MockServer, ResponseTemplate};
741
742 fn build_client() -> Client {
743 Client::builder()
744 .timeout(Duration::from_secs(2))
745 .min_request_interval(Duration::ZERO)
748 .max_retries(0)
751 .build()
752 .expect("client builds")
753 }
754
755 fn site_with(server: &MockServer, signals: Vec<Signal>) -> Site {
756 Site {
757 name: "Mock".into(),
758 url: UrlTemplate::new(format!("{}/{{username}}", server.uri())).unwrap(),
759 signals,
760 known_present: None,
761 known_absent: None,
762 extract: Vec::new(),
763 tags: Vec::new(),
764 request_headers: std::collections::BTreeMap::new(),
765 regex_check: None,
766 engine: None,
767 }
768 }
769
770 fn user() -> Username {
771 Username::new("alice").unwrap()
772 }
773
774 #[tokio::test]
775 async fn regex_check_short_circuits_before_any_request() {
776 let server = MockServer::start().await;
780 Mock::given(method("GET"))
781 .respond_with(ResponseTemplate::new(200))
782 .mount(&server)
783 .await;
784 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
785 site.regex_check = Some("^[A-Za-z]{8,}$".into());
787 let outcome = build_client().check(&site, &user()).await;
788 assert_eq!(outcome.kind, MatchKind::Uncertain);
789 assert!(
790 matches!(outcome.reason, Some(UncertainReason::UsernameNotAllowed)),
791 "expected UsernameNotAllowed, got {:?}",
792 outcome.reason,
793 );
794 let recvd = server.received_requests().await.unwrap_or_default();
797 assert_eq!(
798 recvd.len(),
799 0,
800 "regex_check mismatch must skip the HTTP request entirely"
801 );
802 }
803
804 #[tokio::test]
805 async fn regex_check_pass_proceeds_to_probe() {
806 let server = MockServer::start().await;
807 Mock::given(method("GET"))
808 .and(path("/alice"))
809 .respond_with(ResponseTemplate::new(200))
810 .mount(&server)
811 .await;
812 let mut site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
813 site.regex_check = Some("^[a-z]{3,}$".into());
815 let outcome = build_client().check(&site, &user()).await;
816 assert_eq!(outcome.kind, MatchKind::Found);
817 }
818
819 #[tokio::test]
820 async fn status_signal_reports_found_on_match() {
821 let server = MockServer::start().await;
822 Mock::given(method("GET"))
823 .and(path("/alice"))
824 .respond_with(ResponseTemplate::new(200))
825 .mount(&server)
826 .await;
827 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
828 let outcome = build_client().check(&site, &user()).await;
829 assert_eq!(outcome.kind, MatchKind::Found);
830 assert!(outcome.url.ends_with("/alice"));
831 assert!(outcome.reason.is_none());
832 assert_eq!(outcome.evidence, ["HTTP 200 (status_found)"]);
833 }
834
835 #[tokio::test]
836 async fn status_signal_pair_reports_not_found_on_404() {
837 let server = MockServer::start().await;
838 Mock::given(method("GET"))
839 .and(path("/alice"))
840 .respond_with(ResponseTemplate::new(404))
841 .mount(&server)
842 .await;
843 let site = site_with(
844 &server,
845 vec![
846 Signal::StatusFound { codes: vec![200] },
847 Signal::StatusNotFound { codes: vec![404] },
848 ],
849 );
850 let outcome = build_client().check(&site, &user()).await;
851 assert_eq!(outcome.kind, MatchKind::NotFound);
852 assert_eq!(outcome.evidence, ["HTTP 404 (status_not_found)"]);
854 }
855
856 #[tokio::test]
857 async fn body_absent_signal_detects_missing_account() {
858 let server = MockServer::start().await;
859 Mock::given(method("GET"))
860 .and(path("/alice"))
861 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Profile not found</h1>"))
862 .mount(&server)
863 .await;
864 let site = site_with(
865 &server,
866 vec![Signal::BodyAbsent {
867 text: "Profile not found".into(),
868 }],
869 );
870 let outcome = build_client().check(&site, &user()).await;
871 assert_eq!(outcome.kind, MatchKind::NotFound);
872 }
873
874 #[tokio::test]
875 async fn body_absent_alone_yields_uncertain_when_marker_missing() {
876 let server = MockServer::start().await;
879 Mock::given(method("GET"))
880 .and(path("/alice"))
881 .respond_with(ResponseTemplate::new(200).set_body_string("<h1>Welcome alice</h1>"))
882 .mount(&server)
883 .await;
884 let site = site_with(
885 &server,
886 vec![Signal::BodyAbsent {
887 text: "Profile not found".into(),
888 }],
889 );
890 let outcome = build_client().check(&site, &user()).await;
891 assert_eq!(outcome.kind, MatchKind::Uncertain);
892 }
893
894 #[tokio::test]
895 async fn body_present_plus_absent_resolve_to_found() {
896 let server = MockServer::start().await;
897 Mock::given(method("GET"))
898 .and(path("/alice"))
899 .respond_with(
900 ResponseTemplate::new(200)
901 .set_body_string(r#"<div class="profile-card">alice</div>"#),
902 )
903 .mount(&server)
904 .await;
905 let site = site_with(
906 &server,
907 vec![
908 Signal::BodyPresent {
909 text: "profile-card".into(),
910 },
911 Signal::BodyAbsent {
912 text: "Profile not found".into(),
913 },
914 ],
915 );
916 let outcome = build_client().check(&site, &user()).await;
917 assert_eq!(outcome.kind, MatchKind::Found);
918 }
919
920 #[tokio::test]
921 async fn redirect_absent_signal_detects_missing_account() {
922 let server = MockServer::start().await;
923 Mock::given(method("GET"))
924 .and(path("/alice"))
925 .respond_with(
926 ResponseTemplate::new(302).insert_header("location", "/login?next=/alice"),
927 )
928 .mount(&server)
929 .await;
930 Mock::given(method("GET"))
931 .and(path("/login"))
932 .respond_with(ResponseTemplate::new(200).set_body_string("login page"))
933 .mount(&server)
934 .await;
935 let site = site_with(
936 &server,
937 vec![Signal::RedirectAbsent {
938 fragment: "/login".into(),
939 }],
940 );
941 let outcome = build_client().check(&site, &user()).await;
942 assert_eq!(outcome.kind, MatchKind::NotFound);
943 }
944
945 #[tokio::test]
946 async fn negative_signal_wins_over_positive() {
947 let server = MockServer::start().await;
952 Mock::given(method("GET"))
953 .and(path("/alice"))
954 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
955 .mount(&server)
956 .await;
957 let site = site_with(
958 &server,
959 vec![
960 Signal::StatusFound { codes: vec![200] },
961 Signal::BodyAbsent {
962 text: "Profile not found".into(),
963 },
964 ],
965 );
966 let outcome = build_client().check(&site, &user()).await;
967 assert_eq!(outcome.kind, MatchKind::NotFound);
968 }
969
970 #[tokio::test]
971 async fn network_failure_yields_uncertain() {
972 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
973 let port = listener.local_addr().unwrap().port();
974 drop(listener);
975
976 let site = Site {
977 name: "Dead".into(),
978 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
979 signals: vec![Signal::StatusFound { codes: vec![200] }],
980 known_present: None,
981 known_absent: None,
982 extract: Vec::new(),
983 tags: Vec::new(),
984 request_headers: std::collections::BTreeMap::new(),
985 regex_check: None,
986 engine: None,
987 };
988 let client = Client::builder()
989 .timeout(Duration::from_millis(500))
990 .connect_timeout(Duration::from_millis(500))
991 .max_retries(0)
992 .build()
993 .unwrap();
994 let outcome = client.check(&site, &user()).await;
995 assert_eq!(outcome.kind, MatchKind::Uncertain);
996 assert!(outcome.reason.is_some());
997 }
998
999 #[tokio::test]
1000 async fn throttle_spaces_consecutive_calls_to_same_host() {
1001 let server = MockServer::start().await;
1002 Mock::given(method("GET"))
1003 .and(path("/alice"))
1004 .respond_with(ResponseTemplate::new(200))
1005 .mount(&server)
1006 .await;
1007 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1008 let client = Client::builder()
1013 .timeout(Duration::from_secs(2))
1014 .min_request_interval(Duration::from_millis(300))
1015 .build()
1016 .unwrap();
1017
1018 client.check(&site, &user()).await;
1019 let started = Instant::now();
1020 client.check(&site, &user()).await;
1021 let elapsed = started.elapsed();
1022 assert!(
1023 elapsed >= Duration::from_millis(200),
1024 "second probe to the same host should wait ≥200 ms, got {elapsed:?}",
1025 );
1026 }
1027
1028 #[tokio::test]
1029 async fn builder_overrides_user_agent() {
1030 let server = MockServer::start().await;
1031 Mock::given(method("GET"))
1032 .and(path("/alice"))
1033 .and(wiremock::matchers::header("user-agent", "adler-test/1.0"))
1034 .respond_with(ResponseTemplate::new(200))
1035 .mount(&server)
1036 .await;
1037 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1038 let client = Client::builder()
1039 .user_agent("adler-test/1.0")
1040 .build()
1041 .unwrap();
1042 let outcome = client.check(&site, &user()).await;
1043 assert_eq!(outcome.kind, MatchKind::Found);
1044 }
1045
1046 #[tokio::test]
1047 async fn rate_limit_429_yields_uncertain_with_note() {
1048 let server = MockServer::start().await;
1049 Mock::given(method("GET"))
1050 .and(path("/alice"))
1051 .respond_with(ResponseTemplate::new(429))
1052 .mount(&server)
1053 .await;
1054 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1055 let outcome = build_client().check(&site, &user()).await;
1056 assert_eq!(outcome.kind, MatchKind::Uncertain);
1057 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1058 }
1059
1060 #[tokio::test]
1061 async fn cloudflare_server_header_yields_uncertain() {
1062 let server = MockServer::start().await;
1063 Mock::given(method("GET"))
1064 .and(path("/alice"))
1065 .respond_with(ResponseTemplate::new(503).insert_header("server", "cloudflare"))
1066 .mount(&server)
1067 .await;
1068 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1069 let outcome = build_client().check(&site, &user()).await;
1070 assert_eq!(outcome.kind, MatchKind::Uncertain);
1071 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1072 }
1073
1074 #[tokio::test]
1075 async fn cloudflare_interstitial_in_body_yields_uncertain() {
1076 let server = MockServer::start().await;
1079 Mock::given(method("GET"))
1080 .and(path("/alice"))
1081 .respond_with(
1082 ResponseTemplate::new(200)
1083 .set_body_string("<html><head><title>Just a moment...</title></head></html>"),
1084 )
1085 .mount(&server)
1086 .await;
1087 let site = site_with(
1088 &server,
1089 vec![Signal::BodyAbsent {
1090 text: "Profile not found".into(),
1091 }],
1092 );
1093 let outcome = build_client().check(&site, &user()).await;
1094 assert_eq!(outcome.kind, MatchKind::Uncertain);
1095 assert_eq!(outcome.reason, Some(UncertainReason::CloudflareChallenge));
1096 }
1097
1098 #[tokio::test]
1099 async fn ban_detection_does_not_fire_on_legitimate_403() {
1100 let server = MockServer::start().await;
1101 Mock::given(method("GET"))
1102 .and(path("/alice"))
1103 .respond_with(ResponseTemplate::new(403))
1104 .mount(&server)
1105 .await;
1106 let site = site_with(
1107 &server,
1108 vec![
1109 Signal::StatusFound { codes: vec![200] },
1110 Signal::StatusNotFound { codes: vec![403] },
1111 ],
1112 );
1113 let outcome = build_client().check(&site, &user()).await;
1114 assert_eq!(outcome.kind, MatchKind::NotFound);
1116 assert!(outcome.reason.is_none());
1117 }
1118
1119 #[tokio::test]
1120 async fn retry_recovers_after_transient_429() {
1121 let server = MockServer::start().await;
1122 Mock::given(method("GET"))
1124 .and(path("/alice"))
1125 .respond_with(ResponseTemplate::new(429))
1126 .up_to_n_times(1)
1127 .mount(&server)
1128 .await;
1129 Mock::given(method("GET"))
1130 .and(path("/alice"))
1131 .respond_with(ResponseTemplate::new(200))
1132 .mount(&server)
1133 .await;
1134 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1135 let client = Client::builder()
1136 .timeout(Duration::from_secs(2))
1137 .min_request_interval(Duration::ZERO)
1138 .max_retries(2)
1139 .base_backoff_delay(Duration::from_millis(20))
1140 .max_backoff_delay(Duration::from_millis(100))
1141 .build()
1142 .unwrap();
1143 let outcome = client.check(&site, &user()).await;
1144 assert_eq!(outcome.kind, MatchKind::Found);
1145 assert!(outcome.reason.is_none());
1146 }
1147
1148 #[tokio::test]
1149 async fn retry_exhausts_and_returns_uncertain() {
1150 let server = MockServer::start().await;
1151 Mock::given(method("GET"))
1152 .and(path("/alice"))
1153 .respond_with(ResponseTemplate::new(429))
1154 .mount(&server)
1155 .await;
1156 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1157 let client = Client::builder()
1158 .timeout(Duration::from_secs(2))
1159 .min_request_interval(Duration::ZERO)
1160 .max_retries(2)
1161 .base_backoff_delay(Duration::from_millis(10))
1162 .max_backoff_delay(Duration::from_millis(50))
1163 .build()
1164 .unwrap();
1165 let outcome = client.check(&site, &user()).await;
1166 assert_eq!(outcome.kind, MatchKind::Uncertain);
1167 assert_eq!(outcome.reason, Some(UncertainReason::RateLimited));
1168 }
1169
1170 #[tokio::test]
1171 async fn retry_does_not_fire_on_network_error() {
1172 let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap();
1176 let port = listener.local_addr().unwrap().port();
1177 drop(listener);
1178 let site = Site {
1179 name: "Dead".into(),
1180 url: UrlTemplate::new(format!("http://127.0.0.1:{port}/{{username}}")).unwrap(),
1181 signals: vec![Signal::StatusFound { codes: vec![200] }],
1182 known_present: None,
1183 known_absent: None,
1184 extract: Vec::new(),
1185 tags: Vec::new(),
1186 request_headers: std::collections::BTreeMap::new(),
1187 regex_check: None,
1188 engine: None,
1189 };
1190 let client = Client::builder()
1191 .timeout(Duration::from_millis(500))
1192 .connect_timeout(Duration::from_millis(500))
1193 .min_request_interval(Duration::ZERO)
1194 .max_retries(3)
1195 .base_backoff_delay(Duration::from_secs(60))
1196 .build()
1197 .unwrap();
1198 let started = Instant::now();
1199 let outcome = client.check(&site, &user()).await;
1200 assert!(started.elapsed() < Duration::from_secs(5));
1203 assert_eq!(outcome.kind, MatchKind::Uncertain);
1204 assert!(
1205 matches!(outcome.reason, Some(UncertainReason::Network(_))),
1206 "got {:?}",
1207 outcome.reason,
1208 );
1209 }
1210
1211 #[tokio::test]
1212 async fn rotates_user_agent_per_request() {
1213 let server = MockServer::start().await;
1217 Mock::given(method("GET"))
1218 .and(path("/alice"))
1219 .and(wiremock::matchers::header("user-agent", "RotatorUA/9.9"))
1220 .respond_with(ResponseTemplate::new(200))
1221 .mount(&server)
1222 .await;
1223 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1224 let client = Client::builder()
1225 .min_request_interval(Duration::ZERO)
1226 .max_retries(0)
1227 .rotate_user_agents(vec!["RotatorUA/9.9".into()])
1228 .build()
1229 .unwrap();
1230 let outcome = client.check(&site, &user()).await;
1231 assert_eq!(outcome.kind, MatchKind::Found);
1232 }
1233
1234 #[test]
1235 fn invalid_proxy_url_fails_build() {
1236 let err = Client::builder().proxy("not a url").build().unwrap_err();
1237 assert!(matches!(err, Error::HttpSetup { .. }));
1238 }
1239
1240 #[test]
1241 fn schemeless_proxy_is_rejected_up_front() {
1242 let err = Client::builder().proxy("not-a-url").build().unwrap_err();
1244 let Error::HttpSetup { message } = err else {
1245 panic!("expected HttpSetup, got {err:?}");
1246 };
1247 assert!(message.contains("must start with"), "{message}");
1248 }
1249
1250 #[test]
1251 fn socks5_proxy_scheme_is_accepted() {
1252 assert!(
1254 Client::builder()
1255 .proxy("socks5://127.0.0.1:9050")
1256 .build()
1257 .is_ok()
1258 );
1259 }
1260
1261 #[tokio::test]
1262 async fn global_rps_cap_spaces_requests_across_hosts() {
1263 let server = MockServer::start().await;
1266 Mock::given(method("GET"))
1267 .respond_with(ResponseTemplate::new(200))
1268 .mount(&server)
1269 .await;
1270 let site_a = Site {
1271 name: "A".into(),
1272 url: UrlTemplate::new(format!("{}/a/{{username}}", server.uri())).unwrap(),
1273 signals: vec![Signal::StatusFound { codes: vec![200] }],
1274 known_present: None,
1275 known_absent: None,
1276 extract: Vec::new(),
1277 tags: Vec::new(),
1278 request_headers: std::collections::BTreeMap::new(),
1279 regex_check: None,
1280 engine: None,
1281 };
1282 let site_b = Site {
1283 name: "B".into(),
1284 url: UrlTemplate::new(format!("{}/b/{{username}}", server.uri())).unwrap(),
1285 signals: vec![Signal::StatusFound { codes: vec![200] }],
1286 known_present: None,
1287 known_absent: None,
1288 extract: Vec::new(),
1289 tags: Vec::new(),
1290 request_headers: std::collections::BTreeMap::new(),
1291 regex_check: None,
1292 engine: None,
1293 };
1294 let client = Client::builder()
1299 .min_request_interval(Duration::ZERO)
1300 .max_retries(0)
1301 .max_rps(std::num::NonZeroU32::new(2).unwrap())
1302 .build()
1303 .unwrap();
1304 client.check(&site_a, &user()).await;
1307 let started = Instant::now();
1308 client.check(&site_b, &user()).await;
1309 assert!(
1310 started.elapsed() >= Duration::from_millis(350),
1311 "global cap should space cross-host requests, got {:?}",
1312 started.elapsed(),
1313 );
1314 }
1315
1316 #[tokio::test]
1317 async fn respect_robots_skips_disallowed_paths() {
1318 let server = MockServer::start().await;
1319 Mock::given(method("GET"))
1320 .and(path("/robots.txt"))
1321 .respond_with(
1322 ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /no"),
1323 )
1324 .mount(&server)
1325 .await;
1326 Mock::given(method("GET"))
1327 .and(path("/no/alice"))
1328 .respond_with(ResponseTemplate::new(200))
1329 .mount(&server)
1330 .await;
1331 Mock::given(method("GET"))
1332 .and(path("/yes/alice"))
1333 .respond_with(ResponseTemplate::new(200))
1334 .mount(&server)
1335 .await;
1336 let client = Client::builder()
1337 .min_request_interval(Duration::ZERO)
1338 .max_retries(0)
1339 .respect_robots(true)
1340 .build()
1341 .unwrap();
1342
1343 let disallowed = Site {
1344 name: "No".into(),
1345 url: UrlTemplate::new(format!("{}/no/{{username}}", server.uri())).unwrap(),
1346 signals: vec![Signal::StatusFound { codes: vec![200] }],
1347 known_present: None,
1348 known_absent: None,
1349 extract: Vec::new(),
1350 tags: Vec::new(),
1351 request_headers: std::collections::BTreeMap::new(),
1352 regex_check: None,
1353 engine: None,
1354 };
1355 let allowed = Site {
1356 name: "Yes".into(),
1357 url: UrlTemplate::new(format!("{}/yes/{{username}}", server.uri())).unwrap(),
1358 signals: vec![Signal::StatusFound { codes: vec![200] }],
1359 known_present: None,
1360 known_absent: None,
1361 extract: Vec::new(),
1362 tags: Vec::new(),
1363 request_headers: std::collections::BTreeMap::new(),
1364 regex_check: None,
1365 engine: None,
1366 };
1367
1368 let no = client.check(&disallowed, &user()).await;
1369 assert_eq!(no.kind, MatchKind::Uncertain);
1370 assert_eq!(no.reason, Some(UncertainReason::RobotsDisallowed));
1371
1372 let yes = client.check(&allowed, &user()).await;
1373 assert_eq!(yes.kind, MatchKind::Found);
1374 }
1375
1376 #[tokio::test]
1377 async fn body_read_skipped_when_no_body_signal_needed() {
1378 let server = MockServer::start().await;
1381 Mock::given(method("GET"))
1382 .and(path("/alice"))
1383 .respond_with(ResponseTemplate::new(200).set_body_string("Profile not found"))
1384 .mount(&server)
1385 .await;
1386 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1387 let outcome = build_client().check(&site, &user()).await;
1388 assert_eq!(outcome.kind, MatchKind::Found);
1389 }
1390
1391 #[derive(Debug)]
1397 struct RecordingBackend {
1398 page: RenderedPage,
1399 calls: std::sync::atomic::AtomicUsize,
1400 }
1401
1402 impl RecordingBackend {
1403 fn with_page(page: RenderedPage) -> Self {
1404 Self {
1405 page,
1406 calls: std::sync::atomic::AtomicUsize::new(0),
1407 }
1408 }
1409 fn call_count(&self) -> usize {
1410 self.calls.load(std::sync::atomic::Ordering::SeqCst)
1411 }
1412 }
1413
1414 #[async_trait::async_trait]
1415 impl BrowserBackend for RecordingBackend {
1416 async fn fetch(
1417 &self,
1418 _url: &url::Url,
1419 _headers: &std::collections::BTreeMap<String, String>,
1420 _timeout: Duration,
1421 ) -> Result<RenderedPage> {
1422 self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
1423 Ok(self.page.clone())
1424 }
1425 }
1426
1427 fn site_bot_protected(server: &MockServer) -> Site {
1428 let mut s = site_with(server, vec![Signal::StatusFound { codes: vec![200] }]);
1429 s.tags = vec!["bot-protected".into()];
1430 s
1431 }
1432
1433 #[tokio::test]
1434 async fn browser_routes_bot_protected_sites() {
1435 let server = MockServer::start().await;
1438 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1439 status: 200,
1440 final_url: url::Url::parse("https://example.com/alice").unwrap(),
1441 body: "<html></html>".into(),
1442 elapsed_ms: 42,
1443 }));
1444 let client = Client::builder()
1445 .min_request_interval(Duration::ZERO)
1446 .max_retries(0)
1447 .browser(backend.clone())
1448 .build()
1449 .unwrap();
1450 let outcome = client.check(&site_bot_protected(&server), &user()).await;
1451 assert_eq!(outcome.kind, MatchKind::Found);
1452 assert_eq!(backend.call_count(), 1, "browser invoked exactly once");
1453 }
1454
1455 #[tokio::test]
1456 async fn non_bot_protected_sites_skip_browser() {
1457 let server = MockServer::start().await;
1458 Mock::given(method("GET"))
1459 .and(path("/alice"))
1460 .respond_with(ResponseTemplate::new(200))
1461 .mount(&server)
1462 .await;
1463 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1464 status: 500, final_url: url::Url::parse("https://x/").unwrap(),
1466 body: String::new(),
1467 elapsed_ms: 0,
1468 }));
1469 let client = Client::builder()
1470 .min_request_interval(Duration::ZERO)
1471 .max_retries(0)
1472 .browser(backend.clone())
1473 .build()
1474 .unwrap();
1475 let site = site_with(&server, vec![Signal::StatusFound { codes: vec![200] }]);
1477 let outcome = client.check(&site, &user()).await;
1478 assert_eq!(outcome.kind, MatchKind::Found);
1479 assert_eq!(backend.call_count(), 0, "browser must not be touched");
1480 }
1481
1482 #[tokio::test]
1483 async fn browser_budget_exhaust_yields_uncertain() {
1484 let server = MockServer::start().await;
1485 let backend = Arc::new(RecordingBackend::with_page(RenderedPage {
1486 status: 200,
1487 final_url: url::Url::parse("https://x/").unwrap(),
1488 body: String::new(),
1489 elapsed_ms: 0,
1490 }));
1491 let client = Client::builder()
1492 .min_request_interval(Duration::ZERO)
1493 .max_retries(0)
1494 .browser(backend.clone())
1495 .browser_budget(1)
1496 .build()
1497 .unwrap();
1498 let site = site_bot_protected(&server);
1499 let first = client.check(&site, &user()).await;
1501 assert_eq!(first.kind, MatchKind::Found);
1502 let second = client.check(&site, &user()).await;
1504 assert_eq!(second.kind, MatchKind::Uncertain);
1505 assert!(matches!(
1506 second.reason,
1507 Some(UncertainReason::BrowserBudget)
1508 ));
1509 assert_eq!(
1510 backend.call_count(),
1511 1,
1512 "second call must not invoke backend"
1513 );
1514 }
1515
1516 #[tokio::test]
1517 async fn browser_failure_surfaces_as_uncertain_browser_failed() {
1518 struct FailingBackend;
1519 #[async_trait::async_trait]
1520 impl BrowserBackend for FailingBackend {
1521 async fn fetch(
1522 &self,
1523 _url: &url::Url,
1524 _headers: &std::collections::BTreeMap<String, String>,
1525 _timeout: Duration,
1526 ) -> Result<RenderedPage> {
1527 Err(Error::BrowserSetup {
1528 message: "simulated crash".into(),
1529 })
1530 }
1531 }
1532 impl std::fmt::Debug for FailingBackend {
1533 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1534 f.write_str("FailingBackend")
1535 }
1536 }
1537
1538 let server = MockServer::start().await;
1539 let client = Client::builder()
1540 .min_request_interval(Duration::ZERO)
1541 .max_retries(0)
1542 .browser(Arc::new(FailingBackend))
1543 .build()
1544 .unwrap();
1545 let outcome = client.check(&site_bot_protected(&server), &user()).await;
1546 assert_eq!(outcome.kind, MatchKind::Uncertain);
1547 match outcome.reason {
1548 Some(UncertainReason::BrowserFailed(msg)) => {
1549 assert!(msg.contains("simulated crash"), "got: {msg}");
1550 }
1551 other => panic!("expected BrowserFailed, got {other:?}"),
1552 }
1553 }
1554}