use std::fmt;
use std::num::NonZeroU32;
use std::sync::Arc;
use std::time::Duration;
use reqwest::redirect;
use crate::access::{EgressPool, EgressSpec, SessionStore};
use crate::browser::{BrowserBackend, BrowserBudget};
use crate::error::{Error, Result};
use crate::retry::RetryPolicy;
use crate::robots::RobotsCache;
use crate::throttle::HostThrottle;
use crate::transport::HttpFetcher;
#[cfg(feature = "impersonate")]
use crate::transport::ImpersonateFetcher;
use super::util::default_user_agent;
use super::{
Client, DEFAULT_CONNECT_TIMEOUT, DEFAULT_PER_HOST_INTERVAL, DEFAULT_REDIRECT_LIMIT,
DEFAULT_TIMEOUT,
};
/// Fluent builder for [`Client`].
///
/// Start from [`ClientBuilder::default`], chain the setters, and finish with
/// [`ClientBuilder::build`], which constructs the underlying `reqwest`
/// clients and the supporting throttles, budgets, and caches.
#[derive(Clone)]
#[must_use = "ClientBuilder does nothing until `.build()` is called"]
// Four separate on/off flags trip this lint; each one is toggled by its own
// builder method.
#[allow(clippy::struct_excessive_bools)]
pub struct ClientBuilder {
/// Request timeout handed to the `reqwest` client builder.
timeout: Duration,
/// Connect timeout handed to the `reqwest` client builder.
connect_timeout: Duration,
/// `User-Agent` value configured on every `reqwest` client that is built.
user_agent: String,
/// Whether redirects are followed; when `false` the redirect policy is `none`.
follow_redirects: bool,
/// Hop limit passed to `redirect::Policy::limited` when redirects are followed.
redirect_limit: usize,
/// Interval used to construct the client's `throttle`.
min_request_interval: Duration,
/// Optional requests-per-second cap; `build` turns it into a second
/// throttle with an interval of one second divided by this value.
max_rps: Option<NonZeroU32>,
/// Retry policy moved into the client unchanged.
retry: RetryPolicy,
/// Optional proxy URL applied to the base HTTP client.
proxy: Option<String>,
/// User-agent strings stored on the client as a shared slice.
user_agents: Vec<String>,
/// Flag copied to the client's `enrich` field.
enrich: bool,
/// When `true`, `build` creates a `RobotsCache` for the client.
respect_robots: bool,
/// Optional browser backend moved into the client.
browser: Option<Arc<dyn BrowserBackend>>,
/// Cap passed to `BrowserBudget::new`.
browser_budget: usize,
/// Egress specifications; `build` creates one proxied HTTP client per entry.
egress: Vec<EgressSpec>,
/// Session store moved into the client.
sessions: SessionStore,
/// Cap passed to `EscalationBudget::new`.
escalation_budget: usize,
/// Flag copied to the client's `escalation_enabled` field.
escalation_enabled: bool,
}
impl Default for ClientBuilder {
fn default() -> Self {
Self {
timeout: DEFAULT_TIMEOUT,
connect_timeout: DEFAULT_CONNECT_TIMEOUT,
user_agent: default_user_agent(),
follow_redirects: true,
redirect_limit: DEFAULT_REDIRECT_LIMIT,
min_request_interval: DEFAULT_PER_HOST_INTERVAL,
max_rps: None,
retry: RetryPolicy::default(),
proxy: None,
user_agents: Vec::new(),
enrich: false,
respect_robots: false,
browser: None,
browser_budget: DEFAULT_BROWSER_BUDGET,
egress: Vec::new(),
sessions: SessionStore::new(),
escalation_budget: DEFAULT_ESCALATION_BUDGET,
escalation_enabled: true,
}
}
}
/// Configuration setters and the final [`ClientBuilder::build`] step.
///
/// Every setter takes the builder by value and returns it, so calls chain.
impl ClientBuilder {
    /// Sets the request timeout configured on every `reqwest` client built
    /// by [`ClientBuilder::build`] (the base client and each egress client).
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }

    /// Sets the connect timeout configured on every `reqwest` client built
    /// by [`ClientBuilder::build`].
    pub fn connect_timeout(mut self, timeout: Duration) -> Self {
        self.connect_timeout = timeout;
        self
    }

    /// Sets the `User-Agent` configured on every `reqwest` client built by
    /// [`ClientBuilder::build`].
    pub fn user_agent(mut self, user_agent: impl Into<String>) -> Self {
        self.user_agent = user_agent.into();
        self
    }

    /// Chooses whether redirects are followed.
    ///
    /// When `false`, the clients are built with `redirect::Policy::none()`;
    /// otherwise redirects are followed up to the configured redirect limit.
    pub fn follow_redirects(mut self, follow: bool) -> Self {
        self.follow_redirects = follow;
        self
    }

    /// Sets the maximum number of redirect hops followed per request.
    ///
    /// The value is passed to `redirect::Policy::limited` and only takes
    /// effect while redirect following is enabled. Defaults to
    /// `DEFAULT_REDIRECT_LIMIT`.
    pub const fn redirect_limit(mut self, limit: usize) -> Self {
        self.redirect_limit = limit;
        self
    }

    /// Sets the interval used to construct the client's `HostThrottle`.
    pub fn min_request_interval(mut self, interval: Duration) -> Self {
        self.min_request_interval = interval;
        self
    }

    /// Caps the request rate at `rps` requests per second.
    ///
    /// `build` converts this into an additional throttle whose interval is
    /// one second divided by `rps`.
    pub fn max_rps(mut self, rps: NonZeroU32) -> Self {
        self.max_rps = Some(rps);
        self
    }

    /// Sets `max_retries` on the retry policy.
    pub fn max_retries(mut self, n: u32) -> Self {
        self.retry.max_retries = n;
        self
    }

    /// Sets `base_delay` on the retry policy.
    pub fn base_backoff_delay(mut self, d: Duration) -> Self {
        self.retry.base_delay = d;
        self
    }

    /// Sets `max_delay` on the retry policy.
    pub fn max_backoff_delay(mut self, d: Duration) -> Self {
        self.retry.max_delay = d;
        self
    }

    /// Routes the base HTTP client through the proxy at `url`.
    ///
    /// The URL is validated in [`ClientBuilder::build`]; it must use an
    /// `http`, `https`, `socks5`, or `socks5h` scheme. Egress clients are not
    /// affected: each one is proxied through its own spec URL instead.
    pub fn proxy(mut self, url: impl Into<String>) -> Self {
        self.proxy = Some(url.into());
        self
    }

    /// Replaces the list of user-agent strings stored on the client.
    pub fn rotate_user_agents(mut self, agents: Vec<String>) -> Self {
        self.user_agents = agents;
        self
    }

    /// Sets the client's `enrich` flag.
    pub fn enrich(mut self, enrich: bool) -> Self {
        self.enrich = enrich;
        self
    }

    /// When `respect` is `true`, `build` creates a `RobotsCache` that shares
    /// the base HTTP client; otherwise the client has no robots cache.
    pub fn respect_robots(mut self, respect: bool) -> Self {
        self.respect_robots = respect;
        self
    }

    /// Installs the browser backend handed to the client.
    pub fn browser(mut self, backend: Arc<dyn BrowserBackend>) -> Self {
        self.browser = Some(backend);
        self
    }

    /// Sets the cap passed to `BrowserBudget::new`.
    pub const fn browser_budget(mut self, cap: usize) -> Self {
        self.browser_budget = cap;
        self
    }

    /// Sets the cap passed to `EscalationBudget::new`.
    pub const fn escalation_budget(mut self, cap: usize) -> Self {
        self.escalation_budget = cap;
        self
    }

    /// Turns the client's `escalation_enabled` flag off (it defaults to on).
    pub const fn disable_escalation(mut self) -> Self {
        self.escalation_enabled = false;
        self
    }

    /// Replaces the egress specifications.
    ///
    /// `build` creates one dedicated HTTP client per spec, proxied through
    /// that spec's URL.
    pub fn egress_pool(mut self, egress: Vec<EgressSpec>) -> Self {
        self.egress = egress;
        self
    }

    /// Replaces the session store handed to the client.
    pub fn sessions(mut self, sessions: SessionStore) -> Self {
        self.sessions = sessions;
        self
    }

    /// Consumes the builder and assembles the [`Client`].
    ///
    /// # Errors
    ///
    /// Returns an error when the proxy URL or any egress URL is rejected, or
    /// when `reqwest` fails to construct a client. With the `impersonate`
    /// feature enabled, a failure of `ImpersonateFetcher::new` is propagated
    /// as well.
    pub fn build(self) -> Result<Client> {
        // Base client: honours the optional builder-level proxy.
        let inner = build_reqwest(
            &self.user_agent,
            self.timeout,
            self.connect_timeout,
            self.follow_redirects,
            self.redirect_limit,
            self.proxy.as_deref(),
        )?;

        // One client per egress spec, sharing every setting except the proxy,
        // which is the spec's own URL.
        let mut egress_entries = Vec::with_capacity(self.egress.len());
        for spec in &self.egress {
            let client = build_reqwest(
                &self.user_agent,
                self.timeout,
                self.connect_timeout,
                self.follow_redirects,
                self.redirect_limit,
                Some(&spec.url),
            )?;
            egress_entries.push((
                spec.name.clone(),
                spec.country.clone(),
                spec.kind,
                Arc::new(HttpFetcher::new(client)),
            ));
        }

        // `rps` is non-zero by type, so the division cannot fail.
        let global_throttle = self.max_rps.map(|rps| {
            let interval = Duration::from_secs(1) / rps.get();
            HostThrottle::new(interval)
        });

        let robots = self
            .respect_robots
            .then(|| RobotsCache::new(inner.clone(), "adler"));

        #[cfg(feature = "impersonate")]
        let impersonate = Some(Arc::new(ImpersonateFetcher::new()?));

        Ok(Client {
            http: Arc::new(HttpFetcher::new(inner)),
            egress: Arc::new(EgressPool::new(egress_entries)),
            sessions: Arc::new(self.sessions),
            throttle: HostThrottle::new(self.min_request_interval),
            global_throttle,
            retry: self.retry,
            user_agents: Arc::from(self.user_agents),
            enrich: self.enrich,
            robots,
            browser: self.browser,
            browser_budget: Arc::new(BrowserBudget::new(self.browser_budget)),
            escalation_budget: Arc::new(crate::escalation::EscalationBudget::new(
                self.escalation_budget,
            )),
            escalation_enabled: self.escalation_enabled,
            #[cfg(feature = "impersonate")]
            impersonate,
        })
    }
}
/// Builds a `reqwest::Client` from the shared transport settings.
///
/// * `user_agent` – value configured as the client's `User-Agent`.
/// * `timeout` / `connect_timeout` – forwarded to the `reqwest` builder.
/// * `follow_redirects` / `redirect_limit` – select between
///   `redirect::Policy::limited(redirect_limit)` and `redirect::Policy::none()`.
/// * `proxy` – optional proxy URL applied to all traffic of this client.
///
/// # Errors
///
/// Returns `Error::HttpSetup` when the proxy URL does not start with a
/// supported scheme, when `reqwest` rejects the proxy URL, or when the client
/// itself cannot be constructed.
fn build_reqwest(
    user_agent: &str,
    timeout: Duration,
    connect_timeout: Duration,
    follow_redirects: bool,
    redirect_limit: usize,
    proxy: Option<&str>,
) -> Result<reqwest::Client> {
    let redirect_policy = if follow_redirects {
        redirect::Policy::limited(redirect_limit)
    } else {
        redirect::Policy::none()
    };

    let mut builder = reqwest::Client::builder()
        .user_agent(user_agent.to_owned())
        .timeout(timeout)
        .connect_timeout(connect_timeout)
        .redirect(redirect_policy);

    if let Some(proxy_url) = proxy {
        const SCHEMES: [&str; 4] = ["http://", "https://", "socks5://", "socks5h://"];

        // URI schemes are case-insensitive, so `HTTP://proxy` must pass the
        // pre-check just like `http://proxy`. `get` yields `None` for inputs
        // that are too short or would be split inside a multi-byte character.
        let scheme_supported = SCHEMES.iter().any(|&scheme| {
            matches!(
                proxy_url.get(..scheme.len()),
                Some(prefix) if prefix.eq_ignore_ascii_case(scheme)
            )
        });
        if !scheme_supported {
            return Err(Error::HttpSetup {
                message: format!(
                    "invalid proxy {proxy_url:?}: must start with one of {}",
                    SCHEMES.join(", ")
                ),
            });
        }

        let proxy = reqwest::Proxy::all(proxy_url).map_err(|e| Error::HttpSetup {
            message: format!("invalid proxy {proxy_url:?}: {e}"),
        })?;
        builder = builder.proxy(proxy);
    }

    builder.build().map_err(|e| Error::HttpSetup {
        message: e.to_string(),
    })
}
/// Default value of the builder's `browser_budget`, which `build` passes to
/// `BrowserBudget::new` as the cap.
pub const DEFAULT_BROWSER_BUDGET: usize = 50;
/// Default value of the builder's `escalation_budget`, which `build` passes
/// to `EscalationBudget::new` as the cap.
pub const DEFAULT_ESCALATION_BUDGET: usize = 30;
/// Manual `Debug` implementation.
///
/// The browser backend is reported only as present/absent, and any
/// `user:password@` component of the proxy URL is masked so credentials do
/// not end up in logs that debug-print the builder.
impl fmt::Debug for ClientBuilder {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Exhaustive destructuring: adding a field to the struct without
        // deciding how to print it here becomes a compile error.
        let Self {
            timeout,
            connect_timeout,
            user_agent,
            follow_redirects,
            redirect_limit,
            min_request_interval,
            max_rps,
            retry,
            proxy,
            user_agents,
            enrich,
            respect_robots,
            browser,
            browser_budget,
            egress,
            sessions,
            escalation_budget,
            escalation_enabled,
        } = self;

        f.debug_struct("ClientBuilder")
            .field("timeout", timeout)
            .field("connect_timeout", connect_timeout)
            .field("user_agent", user_agent)
            .field("follow_redirects", follow_redirects)
            .field("redirect_limit", redirect_limit)
            .field("min_request_interval", min_request_interval)
            .field("max_rps", max_rps)
            .field("retry", retry)
            .field("proxy", &proxy.as_deref().map(redact_proxy_userinfo))
            .field("user_agents", user_agents)
            .field("enrich", enrich)
            .field("respect_robots", respect_robots)
            .field("browser", &browser.is_some())
            .field("browser_budget", browser_budget)
            .field("egress", egress)
            .field("sessions", sessions)
            .field("escalation_budget", escalation_budget)
            .field("escalation_enabled", escalation_enabled)
            .finish()
    }
}

/// Returns `url` with the userinfo part of its authority replaced by `***`.
///
/// URLs without an `@` in the authority are returned unchanged.
fn redact_proxy_userinfo(url: &str) -> String {
    // The authority starts right after "scheme://", or at the very beginning
    // when no scheme separator is present.
    let authority_start = url.find("://").map_or(0, |idx| idx + "://".len());
    let rest = &url[authority_start..];

    // The authority ends at the first path, query, or fragment delimiter.
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .unwrap_or(rest.len());

    // The last '@' inside the authority separates userinfo from the host.
    match rest[..authority_end].rfind('@') {
        Some(at) => format!("{}***{}", &url[..authority_start], &rest[at..]),
        None => url.to_owned(),
    }
}