scrapling_fetch/proxy.rs
1//! Proxy configuration and rotation for HTTP requests.
2//!
3//! When scraping at scale, routing requests through proxy servers helps avoid IP-based
4//! rate limiting and bans. This module provides two mechanisms:
5//!
6//! - **Static proxy** -- A single [`Proxy`] set on [`FetcherConfig`](crate::FetcherConfig)
7//! that is used for every request.
8//! - **Proxy rotation** -- A [`ProxyRotator`] that cycles through a pool of proxies,
9//! picking the next one for each request according to a [`RotationStrategy`].
10//!
11//! The [`is_proxy_error`] helper function inspects error messages to determine whether
12//! a failure was proxy-related, which is useful for deciding whether to retry with a
13//! different proxy.
14
15use std::sync::Mutex;
16
17use serde::{Deserialize, Serialize};
18
19use crate::error::FetchError;
20
/// A proxy server specification, either as a URL string or a structured configuration.
///
/// The two variants exist for serialization convenience: simple proxies can be written
/// as a plain URL string in config files, while proxies with credentials use the
/// structured `Config` variant with explicit fields.
///
/// The enum is `#[serde(untagged)]`, so the serialized form carries no variant name:
/// a bare string corresponds to [`Proxy::Url`] and a map with a `server` key
/// corresponds to [`Proxy::Config`].
///
/// NOTE(review): `Debug` is derived, so `{:?}` output includes the `password` field
/// and any credentials embedded in a `Url` value.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Proxy {
    /// A proxy specified as a plain URL string, e.g. `"http://proxy.example.com:8080"`.
    /// If the proxy requires authentication, embed credentials in the URL.
    Url(String),
    /// A proxy specified with explicit server, username, and password fields. This is
    /// cleaner than embedding credentials in the URL and works well in config files.
    Config {
        /// The proxy server address (e.g., `"http://proxy.example.com:8080"`).
        server: String,
        /// Optional authentication username. Defaults to `None` when the key is
        /// absent from the input.
        #[serde(default)]
        username: Option<String>,
        /// Optional authentication password. Defaults to `None` when the key is
        /// absent from the input.
        #[serde(default)]
        password: Option<String>,
    },
}
45
46impl Proxy {
47 fn key(&self) -> String {
48 match self {
49 Self::Url(url) => url.clone(),
50 Self::Config {
51 server, username, ..
52 } => {
53 let user = username.as_deref().unwrap_or("");
54 format!("{server}|{user}")
55 }
56 }
57 }
58
59 /// Returns the proxy server address as a string slice. For `Url` variants this
60 /// is the full URL; for `Config` variants this is the `server` field.
61 pub fn server(&self) -> &str {
62 match self {
63 Self::Url(url) => url.as_str(),
64 Self::Config { server, .. } => server.as_str(),
65 }
66 }
67}
68
69impl std::fmt::Display for Proxy {
70 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71 write!(f, "{}", self.server())
72 }
73}
74
/// A function that determines the next proxy index given the proxy list and the current
/// index. Implement your own to create custom rotation strategies (e.g., random,
/// weighted, or geo-aware selection).
///
/// [`ProxyRotator::get_proxy`] calls the strategy with the full proxy slice and the
/// index of the proxy it is about to return (already reduced modulo the pool size).
/// The returned value is stored as-is and reduced modulo the pool size on the next
/// call, so it does not have to be in bounds.
///
/// This is a plain function pointer, so only free functions and non-capturing
/// closures can be used as strategies.
pub type RotationStrategy = fn(&[Proxy], usize) -> usize;
79
80/// The default rotation strategy that cycles through proxies sequentially (0, 1, 2, ...,
81/// then back to 0). The returned index is wrapped with modulo in [`ProxyRotator::get_proxy`],
82/// so this simply returns `current + 1`.
83pub fn cyclic_rotation(_proxies: &[Proxy], current: usize) -> usize {
84 current + 1
85}
86
/// Lowercase substrings that mark an error message as proxy-related. Messages are
/// lowercased before matching, so every entry here must be lowercase too.
const PROXY_ERROR_INDICATORS: &[&str] = &[
    "net::err_proxy",
    "net::err_tunnel",
    "connection refused",
    "connection reset",
    "connection timed out",
    "failed to connect",
    "could not resolve proxy",
];

/// Returns `true` if the error indicates a proxy-related failure.
///
/// The check is a case-insensitive substring match against known patterns such as
/// "connection refused", "connection reset" or "net::err_tunnel". It is applied to
/// the error's own message and to every error reachable through
/// [`source()`](std::error::Error::source), because wrapping errors often print only
/// a generic message and keep the real cause (e.g. the refused connection) in their
/// source chain.
///
/// Useful in retry logic to decide whether to switch to a different proxy.
pub fn is_proxy_error(error: &dyn std::error::Error) -> bool {
    let mut current: Option<&dyn std::error::Error> = Some(error);
    while let Some(err) = current {
        let msg = err.to_string().to_lowercase();
        if PROXY_ERROR_INDICATORS.iter().any(|ind| msg.contains(ind)) {
            return true;
        }
        // Descend to the underlying cause, if the error exposes one.
        current = err.source();
    }
    false
}
104
/// Thread-safe proxy rotator that cycles through a list of proxies using a configurable strategy.
///
/// The rotator holds a `Mutex`-protected index that advances each time [`get_proxy()`](Self::get_proxy)
/// is called. Duplicate proxies are rejected at construction time to prevent wasted cycles.
/// The default strategy is [`cyclic_rotation`], but you can supply any function matching
/// the [`RotationStrategy`] signature.
pub struct ProxyRotator {
    // The proxy pool. The constructors reject empty lists and lists with duplicates.
    proxies: Vec<Proxy>,
    // Function that computes the next index after a proxy has been picked.
    strategy: RotationStrategy,
    // Last value returned by `strategy` (initially 0). It is stored unreduced and
    // taken modulo `proxies.len()` when the next proxy is picked.
    current_index: Mutex<usize>,
}
116
117impl ProxyRotator {
118 /// Creates a new rotator with the default [`cyclic_rotation`] strategy. Returns
119 /// an error if the proxy list is empty or contains duplicates.
120 pub fn new(proxies: Vec<Proxy>) -> crate::error::Result<Self> {
121 Self::with_strategy(proxies, cyclic_rotation)
122 }
123
124 /// Creates a new rotator with a custom rotation strategy. The strategy function
125 /// receives the full proxy list and the current index, and returns the next index.
126 /// Returns an error if the proxy list is empty or contains duplicates.
127 pub fn with_strategy(
128 proxies: Vec<Proxy>,
129 strategy: RotationStrategy,
130 ) -> crate::error::Result<Self> {
131 if proxies.is_empty() {
132 return Err(FetchError::InvalidProxy(
133 "at least one proxy must be provided".into(),
134 ));
135 }
136 // Validate uniqueness
137 let mut seen = std::collections::HashSet::new();
138 for p in &proxies {
139 let key = p.key();
140 if !seen.insert(key.clone()) {
141 return Err(FetchError::InvalidProxy(format!("duplicate proxy: {key}")));
142 }
143 }
144 Ok(Self {
145 proxies,
146 strategy,
147 current_index: Mutex::new(0),
148 })
149 }
150
151 /// Returns the next proxy according to the rotation strategy and advances the
152 /// internal index. The index is taken modulo the proxy count, so strategies can
153 /// return any value without worrying about bounds.
154 pub fn get_proxy(&self) -> Proxy {
155 let mut idx = self.current_index.lock().unwrap();
156 let actual = *idx % self.proxies.len();
157 let proxy = self.proxies[actual].clone();
158 *idx = (self.strategy)(&self.proxies, actual);
159 proxy
160 }
161
162 /// Returns a slice of all configured proxies. Useful for logging or diagnostics.
163 pub fn proxies(&self) -> &[Proxy] {
164 &self.proxies
165 }
166
167 /// Returns the number of proxies in the rotator. Always at least 1 since empty
168 /// proxy lists are rejected at construction time.
169 pub fn len(&self) -> usize {
170 self.proxies.len()
171 }
172
173 /// Returns `true` if the rotator contains no proxies. In practice this always
174 /// returns `false` since the constructor requires at least one proxy, but this
175 /// method is provided for API completeness alongside [`len()`](Self::len).
176 pub fn is_empty(&self) -> bool {
177 self.proxies.is_empty()
178 }
179}
180
181impl std::fmt::Debug for ProxyRotator {
182 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
183 f.debug_struct("ProxyRotator")
184 .field("count", &self.proxies.len())
185 .finish()
186 }
187}