1use std::borrow::Cow;
17use std::collections::BTreeMap;
18use std::sync::Arc;
19use std::time::{Instant, SystemTime, UNIX_EPOCH};
20
21use crate::access::EgressChoice;
22use crate::check::{CheckOutcome, MatchKind, UncertainReason};
23use crate::escalation::TransportTier;
24use crate::retry;
25use crate::site::{HttpMethod, Probe, ProtectionKind, Signal, SignalVerdict, Site, aggregate};
26use crate::transport::{
27 BROWSER_TIMEOUT, BrowserFetcher, FetchError, FetchRequest, Fetcher, HttpFetcher,
28};
29use crate::username::Username;
30
31use super::util::{host_of, origin_and_path, outcome, uncertain};
32use super::{BOT_PROTECTED_TAG, Client, GLOBAL_THROTTLE_KEY, RawResponse};
33
34fn routes_through_browser(site: &Site) -> bool {
35 site.tags
36 .iter()
37 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG))
38 || site
39 .protection
40 .iter()
41 .any(|p| !matches!(p, ProtectionKind::UserAuth))
42}
43
44#[derive(Debug, Clone, Copy)]
45struct ProbeEvidenceContext {
46 transport: TransportTier,
47 escalations: u8,
48 authenticated: bool,
49}
50
51impl Client {
52 #[tracing::instrument(skip(self), fields(site = %site.name, user = %username))]
66 pub async fn check(&self, site: &Site, username: &Username) -> CheckOutcome {
67 let mut attempt: u32 = 0;
68 loop {
69 let outcome = self.probe_once(site, username).await;
70 if !retry::should_retry(&outcome, attempt, &self.retry) {
71 return outcome;
72 }
73 let delay = retry::backoff_delay(attempt, &self.retry);
74 tracing::info!(
75 site = %site.name,
76 attempt = attempt + 1,
77 reason = outcome.reason.as_ref().map(ToString::to_string).unwrap_or_default(),
78 ?delay,
79 "transient ban, retrying",
80 );
81 tokio::time::sleep(delay).await;
82 attempt += 1;
83 }
84 }
85
86 pub async fn fetch(&self, url: &str) -> Option<RawResponse> {
95 let host = host_of(url);
96 if let Some(global) = &self.global_throttle {
97 global.wait(GLOBAL_THROTTLE_KEY).await;
98 }
99 self.throttle.wait(&host).await;
100 let mut request = self.http.client().get(url);
101 if let Some(ua) = self.pick_user_agent() {
102 request = request.header(reqwest::header::USER_AGENT, ua);
103 }
104 let response = request.send().await.ok()?;
105 let status = response.status().as_u16();
106 let final_url = response.url().to_string();
107 let body = response.text().await.unwrap_or_default();
108 Some(RawResponse {
109 status,
110 final_url,
111 body,
112 })
113 }
114
115 pub async fn fetch_for_doctor(&self, site: &Site, url: &str) -> Option<RawResponse> {
126 if let Some(backend) = self.browser.as_deref() {
127 if routes_through_browser(site) {
128 let parsed = url::Url::parse(url).ok()?;
129 match backend
130 .fetch(&parsed, &site.request_headers, BROWSER_TIMEOUT)
131 .await
132 {
133 Ok(page) => {
134 return Some(RawResponse {
135 status: page.status,
136 final_url: page.final_url.to_string(),
137 body: page.body,
138 });
139 }
140 Err(err) => {
141 tracing::warn!(
142 site = %site.name, %url, error = %err,
143 "browser fetch failed in doctor; falling back to raw HTTP",
144 );
145 }
146 }
147 }
148 }
149 self.fetch(url).await
150 }
151
152 fn pick_user_agent(&self) -> Option<&str> {
155 match self.user_agents.len() {
156 0 => None,
157 1 => Some(&self.user_agents[0]),
158 n => Some(&self.user_agents[fastrand::usize(0..n)]),
159 }
160 }
161
162 #[allow(clippy::too_many_lines)]
165 async fn probe_once(&self, site: &Site, username: &Username) -> CheckOutcome {
166 let url = site.url_for(username);
167
168 if let Some(pat) = &site.regex_check {
178 if let Ok(re) = regex::Regex::new(pat) {
179 if !re.is_match(username.as_str()) {
180 return uncertain(
181 &site.name,
182 url,
183 Instant::now(),
184 UncertainReason::UsernameNotAllowed,
185 );
186 }
187 }
188 }
189
190 let (session_headers, authenticated): (Cow<'_, BTreeMap<String, String>>, bool) =
197 match &site.access.session {
198 None => (Cow::Borrowed(&site.request_headers), false),
199 Some(name) => match self.sessions.get(name) {
200 Some(session) => (Cow::Owned(session.apply(&site.request_headers)), true),
201 None => {
202 return uncertain(
203 &site.name,
204 url,
205 Instant::now(),
206 UncertainReason::SessionRequired,
207 );
208 }
209 },
210 };
211 let headers: &BTreeMap<String, String> = &session_headers;
212
213 if let Some(backend) = &self.browser {
220 if routes_through_browser(site) {
221 if self.browser_budget.try_consume() {
222 let started = Instant::now();
223 let req = FetchRequest {
224 method: site.request_method,
225 url: &url,
226 body: None,
227 user_agent: None,
228 headers,
229 want_body: true,
230 };
231 let fetcher = BrowserFetcher::new(Arc::clone(backend));
232 let mut outcome = match fetcher.fetch(&req).await {
233 Ok(resp) => self.finish(
234 site,
235 username,
236 url,
237 started,
238 &resp,
239 ProbeEvidenceContext {
240 transport: TransportTier::Browser,
241 escalations: 0,
242 authenticated,
243 },
244 ),
245 Err(FetchError(reason)) => uncertain(&site.name, url, started, reason),
246 };
247 outcome.transport = Some(TransportTier::Browser);
248 return outcome;
249 }
250 tracing::warn!(site = %site.name, "browser budget exhausted");
251 let mut outcome = uncertain(
252 &site.name,
253 url,
254 Instant::now(),
255 UncertainReason::BrowserBudget,
256 );
257 outcome.transport = Some(TransportTier::Browser);
258 return outcome;
259 }
260 }
261
262 #[cfg(feature = "impersonate")]
269 if let Some(fetcher) = &self.impersonate {
270 let pure_tls = site.protection.len() == 1
271 && site.protection[0] == crate::site::ProtectionKind::TlsFingerprint
272 && !site
273 .tags
274 .iter()
275 .any(|t| t.eq_ignore_ascii_case(BOT_PROTECTED_TAG));
276 if pure_tls {
277 let started = Instant::now();
278 let req = FetchRequest {
279 method: site.request_method,
280 url: &url,
281 body: None,
282 user_agent: self.pick_user_agent(),
283 headers,
284 want_body: true,
285 };
286 let mut primary = match fetcher.fetch(&req).await {
287 Ok(resp) => self.finish(
288 site,
289 username,
290 url.clone(),
291 started,
292 &resp,
293 ProbeEvidenceContext {
294 transport: TransportTier::Impersonate,
295 escalations: 0,
296 authenticated,
297 },
298 ),
299 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
300 };
301 primary.transport = Some(TransportTier::Impersonate);
302 return self
303 .maybe_escalate(site, username, &url, headers, authenticated, primary)
304 .await;
305 }
306 }
307
308 let egress: Arc<HttpFetcher> = match self.egress.select(&site.access) {
315 EgressChoice::Default => Arc::clone(&self.http),
316 EgressChoice::Use(fetcher) => fetcher,
317 EgressChoice::Unavailable => {
318 return uncertain(
319 &site.name,
320 url,
321 Instant::now(),
322 UncertainReason::GeoUnavailable,
323 );
324 }
325 };
326
327 let host = host_of(&url);
328
329 if let Some(robots) = &self.robots {
331 if let Some((origin, path)) = origin_and_path(&url) {
332 if !robots.allowed(&origin, &path).await {
333 tracing::debug!(%url, "skipped by robots.txt");
334 return uncertain(
335 &site.name,
336 url,
337 Instant::now(),
338 UncertainReason::RobotsDisallowed,
339 );
340 }
341 }
342 }
343
344 if let Some(global) = &self.global_throttle {
346 global.wait(GLOBAL_THROTTLE_KEY).await;
347 }
348 self.throttle.wait(&host).await;
349 let started = Instant::now();
350 tracing::debug!(%url, %host, "probing");
351
352 let want_enrich = self.enrich && !site.extract.is_empty();
355 let needs_body = want_enrich || site.signals.iter().any(crate::site::Signal::needs_body);
356
357 let body_for_post: Option<String> = if matches!(site.request_method, HttpMethod::Post) {
362 const USERNAME_PH: &str = "{username}";
363 site.request_body
364 .as_deref()
365 .map(|t| t.replace(USERNAME_PH, username.as_str()))
366 } else {
367 None
368 };
369
370 let req = FetchRequest {
371 method: site.request_method,
372 url: &url,
373 body: body_for_post.as_deref(),
374 user_agent: self.pick_user_agent(),
375 headers,
376 want_body: needs_body,
377 };
378 let mut primary = match egress.fetch(&req).await {
379 Ok(resp) => self.finish(
380 site,
381 username,
382 url.clone(),
383 started,
384 &resp,
385 ProbeEvidenceContext {
386 transport: TransportTier::Http,
387 escalations: 0,
388 authenticated,
389 },
390 ),
391 Err(FetchError(reason)) => uncertain(&site.name, url.clone(), started, reason),
392 };
393 primary.transport = Some(TransportTier::Http);
394 self.maybe_escalate(site, username, &url, headers, authenticated, primary)
395 .await
396 }
397
398 async fn maybe_escalate(
403 &self,
404 site: &Site,
405 username: &Username,
406 url: &str,
407 headers: &BTreeMap<String, String>,
408 authenticated: bool,
409 primary: CheckOutcome,
410 ) -> CheckOutcome {
411 if !self.escalation_enabled || primary.kind != MatchKind::Uncertain {
412 return primary;
413 }
414 let Some(reason) = &primary.reason else {
415 return primary;
416 };
417 if !crate::escalation::should_escalate(reason) {
418 return primary;
419 }
420 let Some(backend) = &self.browser else {
421 return primary;
422 };
423 if !self.escalation_budget.try_consume() {
424 tracing::debug!(site = %site.name, "escalation budget exhausted");
425 return primary;
426 }
427
428 tracing::debug!(site = %site.name, reason = %reason, "escalating to browser");
429 let started = Instant::now();
430 let req = FetchRequest {
431 method: site.request_method,
432 url,
433 body: None,
434 user_agent: None,
435 headers,
436 want_body: true,
437 };
438 let fetcher = BrowserFetcher::new(Arc::clone(backend));
439 let mut escalated = match fetcher.fetch(&req).await {
440 Ok(resp) => self.finish(
441 site,
442 username,
443 url.to_owned(),
444 started,
445 &resp,
446 ProbeEvidenceContext {
447 transport: TransportTier::Browser,
448 escalations: 1,
449 authenticated,
450 },
451 ),
452 Err(FetchError(r)) => uncertain(&site.name, url.to_owned(), started, r),
453 };
454 escalated.transport = Some(TransportTier::Browser);
455 escalated.escalations = 1;
456 escalated
457 }
458
459 fn finish(
463 &self,
464 site: &Site,
465 username: &Username,
466 url: String,
467 started: Instant,
468 resp: &crate::transport::FetchResponse,
469 context: ProbeEvidenceContext,
470 ) -> CheckOutcome {
471 let canonical_username = site.canonical_username(username);
472 let probe = Probe {
473 status: resp.status,
474 final_url: &resp.final_url,
475 body: &resp.body,
476 username: &canonical_username,
477 };
478 let votes: Vec<(&Signal, SignalVerdict)> = site
479 .signals
480 .iter()
481 .map(|s| (s, s.evaluate(&probe)))
482 .collect();
483 let kind = aggregate(votes.iter().map(|(_, v)| *v));
484 let mut result = outcome(&site.name, url, started, kind);
485 result.transport = Some(context.transport);
486 result.escalations = context.escalations;
487 let winning = match kind {
489 MatchKind::Found => Some(SignalVerdict::Found),
490 MatchKind::NotFound => Some(SignalVerdict::NotFound),
491 MatchKind::Uncertain => None,
492 };
493 if let Some(want) = winning {
494 result.evidence = votes
495 .iter()
496 .filter(|(_, v)| *v == want)
497 .map(|(s, _)| s.describe_match(&probe))
498 .collect();
499 }
500 let username_confirmed = kind == MatchKind::Found
501 && votes
502 .iter()
503 .any(|(s, v)| *v == SignalVerdict::Found && s.confirms_username());
504 if username_confirmed {
505 let observed_at_ms = unix_epoch_ms();
506 let access_path = crate::EvidenceAccessPath::new(
507 context.transport,
508 context.escalations,
509 context.authenticated,
510 );
511 result
512 .profile_evidence
513 .push(crate::ProfileEvidence::from_signal_username(
514 &result.site,
515 &result.url,
516 &canonical_username,
517 Some(observed_at_ms),
518 Some(access_path),
519 ));
520 }
521 if self.enrich && kind == MatchKind::Found && !site.extract.is_empty() {
522 result.enrichment = crate::enrich::extract(&resp.body, &site.extract);
523 let observed_at_ms = unix_epoch_ms();
524 let access_path = crate::EvidenceAccessPath::new(
525 context.transport,
526 context.escalations,
527 context.authenticated,
528 );
529 result.profile_evidence = result
530 .enrichment
531 .iter()
532 .map(|(field, value)| {
533 crate::ProfileEvidence::from_enrichment_with_source(
534 &result.site,
535 &result.url,
536 field,
537 value,
538 Some(observed_at_ms),
539 Some(access_path.clone()),
540 )
541 })
542 .collect();
543 }
544 result.refresh_confidence();
545 result
546 }
547}
548
/// Returns the current wall-clock time as milliseconds since the Unix epoch.
///
/// Yields `u64::MAX` when the system clock reads earlier than the epoch or
/// when the millisecond count does not fit in a `u64`.
fn unix_epoch_ms() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
        Err(_) => u64::MAX,
    }
}
555}